2 const double log2e = 1.4426950408889634073599;
3 const double maxlog = 7.09782712893383996843e2;
4 const double minlog = -7.08396418532264106224e2;
5 const double one = 1.0;
6 const double c1 = 6.93145751953125E-1;
7 const double c2 = 1.42860682030941723212E-6;
10 const double w11 = 3.5524625185478232665958141148891055719216674475023e-8;
11 const double w10 = 2.5535368519306500343384723775435166753084614063349e-7;
12 const double w9 = 2.77750562801295315877005242757916081614772210463065e-6;
13 const double w8 = 2.47868893393199945541176652007657202642495832996107e-5;
14 const double w7 = 1.98419213985637881240770890090795533564573406893163e-4;
15 const double w6 = 1.3888869684178659239014256260881685824525255547326e-3;
16 const double w5 = 8.3333337052009872221152811550156335074160546333973e-3;
17 const double w4 = 4.1666666621080810610346717440523105184720007971655e-2;
18 const double w3 = 0.166666666669960803484477734308515404418108830469798;
19 const double w2 = 0.499999999999877094481580370323249951329122224389189;
20 const double w1 = 1.0000000000000017952745258419615282194236357388884;
21 const double w0 = 0.99999999999999999566016490920259318691496540598896;
38 v4df_t a03_0, a03_1, a03_2, a03_3;
39 v4df_t a47_0, a47_1, a47_2, a47_3;
40 v4df_t p03_0, p03_1, p03_2, p03_3;
41 v4df_t p47_0, p47_1, p47_2, p47_3;
43 v4li_t k03_0, k03_1, k03_2, k03_3;
44 v4li_t k47_0, k47_1, k47_2, k47_3;
45 v4li_t offset, mask0, mask1023;
49 tmp.v = _mm256_broadcast_sd( &maxlog );
50 c03_0.v = _mm256_min_pd( tmp.v, c03_0.v );
51 c03_1.v = _mm256_min_pd( tmp.v, c03_1.v );
52 c03_2.v = _mm256_min_pd( tmp.v, c03_2.v );
53 c03_3.v = _mm256_min_pd( tmp.v, c03_3.v );
54 c47_0.v = _mm256_min_pd( tmp.v, c47_0.v );
55 c47_1.v = _mm256_min_pd( tmp.v, c47_1.v );
56 c47_2.v = _mm256_min_pd( tmp.v, c47_2.v );
57 c47_3.v = _mm256_min_pd( tmp.v, c47_3.v );
59 tmp.v = _mm256_broadcast_sd( &minlog );
60 c03_0.v = _mm256_max_pd( tmp.v, c03_0.v );
61 c03_1.v = _mm256_max_pd( tmp.v, c03_1.v );
62 c03_2.v = _mm256_max_pd( tmp.v, c03_2.v );
63 c03_3.v = _mm256_max_pd( tmp.v, c03_3.v );
64 c47_0.v = _mm256_max_pd( tmp.v, c47_0.v );
65 c47_1.v = _mm256_max_pd( tmp.v, c47_1.v );
66 c47_2.v = _mm256_max_pd( tmp.v, c47_2.v );
67 c47_3.v = _mm256_max_pd( tmp.v, c47_3.v );
71 l2e.v = _mm256_broadcast_sd( &log2e );
72 a03_0.v = _mm256_mul_pd( l2e.v, c03_0.v );
73 a03_1.v = _mm256_mul_pd( l2e.v, c03_1.v );
74 a03_2.v = _mm256_mul_pd( l2e.v, c03_2.v );
75 a03_3.v = _mm256_mul_pd( l2e.v, c03_3.v );
76 a47_0.v = _mm256_mul_pd( l2e.v, c47_0.v );
77 a47_1.v = _mm256_mul_pd( l2e.v, c47_1.v );
78 a47_2.v = _mm256_mul_pd( l2e.v, c47_2.v );
79 a47_3.v = _mm256_mul_pd( l2e.v, c47_3.v );
82 tmp.v = _mm256_setzero_pd();
83 p03_0.v = _mm256_cmp_pd( a03_0.v, tmp.v, 1 );
84 p03_1.v = _mm256_cmp_pd( a03_1.v, tmp.v, 1 );
85 p03_2.v = _mm256_cmp_pd( a03_2.v, tmp.v, 1 );
86 p03_3.v = _mm256_cmp_pd( a03_3.v, tmp.v, 1 );
87 p47_0.v = _mm256_cmp_pd( a47_0.v, tmp.v, 1 );
88 p47_1.v = _mm256_cmp_pd( a47_1.v, tmp.v, 1 );
89 p47_2.v = _mm256_cmp_pd( a47_2.v, tmp.v, 1 );
90 p47_3.v = _mm256_cmp_pd( a47_3.v, tmp.v, 1 );
91 tmp.v = _mm256_broadcast_sd( &one );
92 p03_0.v = _mm256_and_pd( tmp.v, p03_0.v );
93 p03_1.v = _mm256_and_pd( tmp.v, p03_1.v );
94 p03_2.v = _mm256_and_pd( tmp.v, p03_2.v );
95 p03_3.v = _mm256_and_pd( tmp.v, p03_3.v );
96 p47_0.v = _mm256_and_pd( tmp.v, p47_0.v );
97 p47_1.v = _mm256_and_pd( tmp.v, p47_1.v );
98 p47_2.v = _mm256_and_pd( tmp.v, p47_2.v );
99 p47_3.v = _mm256_and_pd( tmp.v, p47_3.v );
101 a03_0.v = _mm256_sub_pd( a03_0.v, p03_0.v );
102 a03_1.v = _mm256_sub_pd( a03_1.v, p03_1.v );
103 a03_2.v = _mm256_sub_pd( a03_2.v, p03_2.v );
104 a03_3.v = _mm256_sub_pd( a03_3.v, p03_3.v );
105 a47_0.v = _mm256_sub_pd( a47_0.v, p47_0.v );
106 a47_1.v = _mm256_sub_pd( a47_1.v, p47_1.v );
107 a47_2.v = _mm256_sub_pd( a47_2.v, p47_2.v );
108 a47_3.v = _mm256_sub_pd( a47_3.v, p47_3.v );
113 k03_0.v = _mm256_cvttpd_epi32( a03_0.v );
114 k03_1.v = _mm256_cvttpd_epi32( a03_1.v );
115 k03_2.v = _mm256_cvttpd_epi32( a03_2.v );
116 k03_3.v = _mm256_cvttpd_epi32( a03_3.v );
117 k47_0.v = _mm256_cvttpd_epi32( a47_0.v );
118 k47_1.v = _mm256_cvttpd_epi32( a47_1.v );
119 k47_2.v = _mm256_cvttpd_epi32( a47_2.v );
120 k47_3.v = _mm256_cvttpd_epi32( a47_3.v );
121 p03_0.v = _mm256_cvtepi32_pd( k03_0.v );
122 p03_1.v = _mm256_cvtepi32_pd( k03_1.v );
123 p03_2.v = _mm256_cvtepi32_pd( k03_2.v );
124 p03_3.v = _mm256_cvtepi32_pd( k03_3.v );
125 p47_0.v = _mm256_cvtepi32_pd( k47_0.v );
126 p47_1.v = _mm256_cvtepi32_pd( k47_1.v );
127 p47_2.v = _mm256_cvtepi32_pd( k47_2.v );
128 p47_3.v = _mm256_cvtepi32_pd( k47_3.v );
139 tmp.v = _mm256_broadcast_sd( &c1 );
140 a03_0.v = _mm256_mul_pd( tmp.v, p03_0.v );
141 a03_1.v = _mm256_mul_pd( tmp.v, p03_1.v );
142 a03_2.v = _mm256_mul_pd( tmp.v, p03_2.v );
143 a03_3.v = _mm256_mul_pd( tmp.v, p03_3.v );
144 a47_0.v = _mm256_mul_pd( tmp.v, p47_0.v );
145 a47_1.v = _mm256_mul_pd( tmp.v, p47_1.v );
146 a47_2.v = _mm256_mul_pd( tmp.v, p47_2.v );
147 a47_3.v = _mm256_mul_pd( tmp.v, p47_3.v );
148 c03_0.v = _mm256_sub_pd( c03_0.v, a03_0.v );
149 c03_1.v = _mm256_sub_pd( c03_1.v, a03_1.v );
150 c03_2.v = _mm256_sub_pd( c03_2.v, a03_2.v );
151 c03_3.v = _mm256_sub_pd( c03_3.v, a03_3.v );
152 c47_0.v = _mm256_sub_pd( c47_0.v, a47_0.v );
153 c47_1.v = _mm256_sub_pd( c47_1.v, a47_1.v );
154 c47_2.v = _mm256_sub_pd( c47_2.v, a47_2.v );
155 c47_3.v = _mm256_sub_pd( c47_3.v, a47_3.v );
156 tmp.v = _mm256_broadcast_sd( &c2 );
157 a03_0.v = _mm256_mul_pd( tmp.v, p03_0.v );
158 a03_1.v = _mm256_mul_pd( tmp.v, p03_1.v );
159 a03_2.v = _mm256_mul_pd( tmp.v, p03_2.v );
160 a03_3.v = _mm256_mul_pd( tmp.v, p03_3.v );
161 a47_0.v = _mm256_mul_pd( tmp.v, p47_0.v );
162 a47_1.v = _mm256_mul_pd( tmp.v, p47_1.v );
163 a47_2.v = _mm256_mul_pd( tmp.v, p47_2.v );
164 a47_3.v = _mm256_mul_pd( tmp.v, p47_3.v );
165 c03_0.v = _mm256_sub_pd( c03_0.v, a03_0.v );
166 c03_1.v = _mm256_sub_pd( c03_1.v, a03_1.v );
167 c03_2.v = _mm256_sub_pd( c03_2.v, a03_2.v );
168 c03_3.v = _mm256_sub_pd( c03_3.v, a03_3.v );
169 c47_0.v = _mm256_sub_pd( c47_0.v, a47_0.v );
170 c47_1.v = _mm256_sub_pd( c47_1.v, a47_1.v );
171 c47_2.v = _mm256_sub_pd( c47_2.v, a47_2.v );
172 c47_3.v = _mm256_sub_pd( c47_3.v, a47_3.v );
181 tmp.v = _mm256_broadcast_sd( &w11 );
183 a03_0.v = _mm256_mul_pd( c03_0.v, tmp.v );
184 a03_1.v = _mm256_mul_pd( c03_1.v, tmp.v );
185 a03_2.v = _mm256_mul_pd( c03_2.v, tmp.v );
186 a03_3.v = _mm256_mul_pd( c03_3.v, tmp.v );
187 a47_0.v = _mm256_mul_pd( c47_0.v, tmp.v );
188 a47_1.v = _mm256_mul_pd( c47_1.v, tmp.v );
189 a47_2.v = _mm256_mul_pd( c47_2.v, tmp.v );
190 a47_3.v = _mm256_mul_pd( c47_3.v, tmp.v );
191 tmp.v = _mm256_broadcast_sd( &w10 );
193 a03_0.v = _mm256_add_pd( a03_0.v, tmp.v );
194 a03_1.v = _mm256_add_pd( a03_1.v, tmp.v );
195 a03_2.v = _mm256_add_pd( a03_2.v, tmp.v );
196 a03_3.v = _mm256_add_pd( a03_3.v, tmp.v );
197 a47_0.v = _mm256_add_pd( a47_0.v, tmp.v );
198 a47_1.v = _mm256_add_pd( a47_1.v, tmp.v );
199 a47_2.v = _mm256_add_pd( a47_2.v, tmp.v );
200 a47_3.v = _mm256_add_pd( a47_3.v, tmp.v );
204 tmp.v = _mm256_broadcast_sd( &w9 );
205 a03_0.v = _mm256_mul_pd( a03_0.v, c03_0.v );
206 a03_1.v = _mm256_mul_pd( a03_1.v, c03_1.v );
207 a03_2.v = _mm256_mul_pd( a03_2.v, c03_2.v );
208 a03_3.v = _mm256_mul_pd( a03_3.v, c03_3.v );
209 a47_0.v = _mm256_mul_pd( a47_0.v, c47_0.v );
210 a47_1.v = _mm256_mul_pd( a47_1.v, c47_1.v );
211 a47_2.v = _mm256_mul_pd( a47_2.v, c47_2.v );
212 a47_3.v = _mm256_mul_pd( a47_3.v, c47_3.v );
213 a03_0.v = _mm256_add_pd( a03_0.v, tmp.v );
214 a03_1.v = _mm256_add_pd( a03_1.v, tmp.v );
215 a03_2.v = _mm256_add_pd( a03_2.v, tmp.v );
216 a03_3.v = _mm256_add_pd( a03_3.v, tmp.v );
217 a47_0.v = _mm256_add_pd( a47_0.v, tmp.v );
218 a47_1.v = _mm256_add_pd( a47_1.v, tmp.v );
219 a47_2.v = _mm256_add_pd( a47_2.v, tmp.v );
220 a47_3.v = _mm256_add_pd( a47_3.v, tmp.v );
221 tmp.v = _mm256_broadcast_sd( &w8 );
222 a03_0.v = _mm256_mul_pd( a03_0.v, c03_0.v );
223 a03_1.v = _mm256_mul_pd( a03_1.v, c03_1.v );
224 a03_2.v = _mm256_mul_pd( a03_2.v, c03_2.v );
225 a03_3.v = _mm256_mul_pd( a03_3.v, c03_3.v );
226 a47_0.v = _mm256_mul_pd( a47_0.v, c47_0.v );
227 a47_1.v = _mm256_mul_pd( a47_1.v, c47_1.v );
228 a47_2.v = _mm256_mul_pd( a47_2.v, c47_2.v );
229 a47_3.v = _mm256_mul_pd( a47_3.v, c47_3.v );
230 a03_0.v = _mm256_add_pd( a03_0.v, tmp.v );
231 a03_1.v = _mm256_add_pd( a03_1.v, tmp.v );
232 a03_2.v = _mm256_add_pd( a03_2.v, tmp.v );
233 a03_3.v = _mm256_add_pd( a03_3.v, tmp.v );
234 a47_0.v = _mm256_add_pd( a47_0.v, tmp.v );
235 a47_1.v = _mm256_add_pd( a47_1.v, tmp.v );
236 a47_2.v = _mm256_add_pd( a47_2.v, tmp.v );
237 a47_3.v = _mm256_add_pd( a47_3.v, tmp.v );
240 tmp.v = _mm256_broadcast_sd( &w7 );
241 a03_0.v = _mm256_mul_pd( a03_0.v, c03_0.v );
242 a03_1.v = _mm256_mul_pd( a03_1.v, c03_1.v );
243 a03_2.v = _mm256_mul_pd( a03_2.v, c03_2.v );
244 a03_3.v = _mm256_mul_pd( a03_3.v, c03_3.v );
245 a47_0.v = _mm256_mul_pd( a47_0.v, c47_0.v );
246 a47_1.v = _mm256_mul_pd( a47_1.v, c47_1.v );
247 a47_2.v = _mm256_mul_pd( a47_2.v, c47_2.v );
248 a47_3.v = _mm256_mul_pd( a47_3.v, c47_3.v );
249 a03_0.v = _mm256_add_pd( a03_0.v, tmp.v );
250 a03_1.v = _mm256_add_pd( a03_1.v, tmp.v );
251 a03_2.v = _mm256_add_pd( a03_2.v, tmp.v );
252 a03_3.v = _mm256_add_pd( a03_3.v, tmp.v );
253 a47_0.v = _mm256_add_pd( a47_0.v, tmp.v );
254 a47_1.v = _mm256_add_pd( a47_1.v, tmp.v );
255 a47_2.v = _mm256_add_pd( a47_2.v, tmp.v );
256 a47_3.v = _mm256_add_pd( a47_3.v, tmp.v );
257 tmp.v = _mm256_broadcast_sd( &w6 );
258 a03_0.v = _mm256_mul_pd( a03_0.v, c03_0.v );
259 a03_1.v = _mm256_mul_pd( a03_1.v, c03_1.v );
260 a03_2.v = _mm256_mul_pd( a03_2.v, c03_2.v );
261 a03_3.v = _mm256_mul_pd( a03_3.v, c03_3.v );
262 a47_0.v = _mm256_mul_pd( a47_0.v, c47_0.v );
263 a47_1.v = _mm256_mul_pd( a47_1.v, c47_1.v );
264 a47_2.v = _mm256_mul_pd( a47_2.v, c47_2.v );
265 a47_3.v = _mm256_mul_pd( a47_3.v, c47_3.v );
266 a03_0.v = _mm256_add_pd( a03_0.v, tmp.v );
267 a03_1.v = _mm256_add_pd( a03_1.v, tmp.v );
268 a03_2.v = _mm256_add_pd( a03_2.v, tmp.v );
269 a03_3.v = _mm256_add_pd( a03_3.v, tmp.v );
270 a47_0.v = _mm256_add_pd( a47_0.v, tmp.v );
271 a47_1.v = _mm256_add_pd( a47_1.v, tmp.v );
272 a47_2.v = _mm256_add_pd( a47_2.v, tmp.v );
273 a47_3.v = _mm256_add_pd( a47_3.v, tmp.v );
276 tmp.v = _mm256_broadcast_sd( &w5 );
277 a03_0.v = _mm256_mul_pd( a03_0.v, c03_0.v );
278 a03_1.v = _mm256_mul_pd( a03_1.v, c03_1.v );
279 a03_2.v = _mm256_mul_pd( a03_2.v, c03_2.v );
280 a03_3.v = _mm256_mul_pd( a03_3.v, c03_3.v );
281 a47_0.v = _mm256_mul_pd( a47_0.v, c47_0.v );
282 a47_1.v = _mm256_mul_pd( a47_1.v, c47_1.v );
283 a47_2.v = _mm256_mul_pd( a47_2.v, c47_2.v );
284 a47_3.v = _mm256_mul_pd( a47_3.v, c47_3.v );
285 a03_0.v = _mm256_add_pd( a03_0.v, tmp.v );
286 a03_1.v = _mm256_add_pd( a03_1.v, tmp.v );
287 a03_2.v = _mm256_add_pd( a03_2.v, tmp.v );
288 a03_3.v = _mm256_add_pd( a03_3.v, tmp.v );
289 a47_0.v = _mm256_add_pd( a47_0.v, tmp.v );
290 a47_1.v = _mm256_add_pd( a47_1.v, tmp.v );
291 a47_2.v = _mm256_add_pd( a47_2.v, tmp.v );
292 a47_3.v = _mm256_add_pd( a47_3.v, tmp.v );
293 tmp.v = _mm256_broadcast_sd( &w4 );
294 a03_0.v = _mm256_mul_pd( a03_0.v, c03_0.v );
295 a03_1.v = _mm256_mul_pd( a03_1.v, c03_1.v );
296 a03_2.v = _mm256_mul_pd( a03_2.v, c03_2.v );
297 a03_3.v = _mm256_mul_pd( a03_3.v, c03_3.v );
298 a47_0.v = _mm256_mul_pd( a47_0.v, c47_0.v );
299 a47_1.v = _mm256_mul_pd( a47_1.v, c47_1.v );
300 a47_2.v = _mm256_mul_pd( a47_2.v, c47_2.v );
301 a47_3.v = _mm256_mul_pd( a47_3.v, c47_3.v );
302 a03_0.v = _mm256_add_pd( a03_0.v, tmp.v );
303 a03_1.v = _mm256_add_pd( a03_1.v, tmp.v );
304 a03_2.v = _mm256_add_pd( a03_2.v, tmp.v );
305 a03_3.v = _mm256_add_pd( a03_3.v, tmp.v );
306 a47_0.v = _mm256_add_pd( a47_0.v, tmp.v );
307 a47_1.v = _mm256_add_pd( a47_1.v, tmp.v );
308 a47_2.v = _mm256_add_pd( a47_2.v, tmp.v );
309 a47_3.v = _mm256_add_pd( a47_3.v, tmp.v );
318 tmp.v = _mm256_broadcast_sd( &w3 );
319 a03_0.v = _mm256_mul_pd( a03_0.v, c03_0.v );
320 a03_1.v = _mm256_mul_pd( a03_1.v, c03_1.v );
321 a03_2.v = _mm256_mul_pd( a03_2.v, c03_2.v );
322 a03_3.v = _mm256_mul_pd( a03_3.v, c03_3.v );
323 a47_0.v = _mm256_mul_pd( a47_0.v, c47_0.v );
324 a47_1.v = _mm256_mul_pd( a47_1.v, c47_1.v );
325 a47_2.v = _mm256_mul_pd( a47_2.v, c47_2.v );
326 a47_3.v = _mm256_mul_pd( a47_3.v, c47_3.v );
327 a03_0.v = _mm256_add_pd( a03_0.v, tmp.v );
328 a03_1.v = _mm256_add_pd( a03_1.v, tmp.v );
329 a03_2.v = _mm256_add_pd( a03_2.v, tmp.v );
330 a03_3.v = _mm256_add_pd( a03_3.v, tmp.v );
331 a47_0.v = _mm256_add_pd( a47_0.v, tmp.v );
332 a47_1.v = _mm256_add_pd( a47_1.v, tmp.v );
333 a47_2.v = _mm256_add_pd( a47_2.v, tmp.v );
334 a47_3.v = _mm256_add_pd( a47_3.v, tmp.v );
335 tmp.v = _mm256_broadcast_sd( &w2 );
336 a03_0.v = _mm256_mul_pd( a03_0.v, c03_0.v );
337 a03_1.v = _mm256_mul_pd( a03_1.v, c03_1.v );
338 a03_2.v = _mm256_mul_pd( a03_2.v, c03_2.v );
339 a03_3.v = _mm256_mul_pd( a03_3.v, c03_3.v );
340 a47_0.v = _mm256_mul_pd( a47_0.v, c47_0.v );
341 a47_1.v = _mm256_mul_pd( a47_1.v, c47_1.v );
342 a47_2.v = _mm256_mul_pd( a47_2.v, c47_2.v );
343 a47_3.v = _mm256_mul_pd( a47_3.v, c47_3.v );
344 a03_0.v = _mm256_add_pd( a03_0.v, tmp.v );
345 a03_1.v = _mm256_add_pd( a03_1.v, tmp.v );
346 a03_2.v = _mm256_add_pd( a03_2.v, tmp.v );
347 a03_3.v = _mm256_add_pd( a03_3.v, tmp.v );
348 a47_0.v = _mm256_add_pd( a47_0.v, tmp.v );
349 a47_1.v = _mm256_add_pd( a47_1.v, tmp.v );
350 a47_2.v = _mm256_add_pd( a47_2.v, tmp.v );
351 a47_3.v = _mm256_add_pd( a47_3.v, tmp.v );
354 tmp.v = _mm256_broadcast_sd( &w1 );
355 a03_0.v = _mm256_mul_pd( a03_0.v, c03_0.v );
356 a03_1.v = _mm256_mul_pd( a03_1.v, c03_1.v );
357 a03_2.v = _mm256_mul_pd( a03_2.v, c03_2.v );
358 a03_3.v = _mm256_mul_pd( a03_3.v, c03_3.v );
359 a47_0.v = _mm256_mul_pd( a47_0.v, c47_0.v );
360 a47_1.v = _mm256_mul_pd( a47_1.v, c47_1.v );
361 a47_2.v = _mm256_mul_pd( a47_2.v, c47_2.v );
362 a47_3.v = _mm256_mul_pd( a47_3.v, c47_3.v );
363 a03_0.v = _mm256_add_pd( a03_0.v, tmp.v );
364 a03_1.v = _mm256_add_pd( a03_1.v, tmp.v );
365 a03_2.v = _mm256_add_pd( a03_2.v, tmp.v );
366 a03_3.v = _mm256_add_pd( a03_3.v, tmp.v );
367 a47_0.v = _mm256_add_pd( a47_0.v, tmp.v );
368 a47_1.v = _mm256_add_pd( a47_1.v, tmp.v );
369 a47_2.v = _mm256_add_pd( a47_2.v, tmp.v );
370 a47_3.v = _mm256_add_pd( a47_3.v, tmp.v );
371 tmp.v = _mm256_broadcast_sd( &w0 );
372 a03_0.v = _mm256_mul_pd( a03_0.v, c03_0.v );
373 a03_1.v = _mm256_mul_pd( a03_1.v, c03_1.v );
374 a03_2.v = _mm256_mul_pd( a03_2.v, c03_2.v );
375 a03_3.v = _mm256_mul_pd( a03_3.v, c03_3.v );
376 a47_0.v = _mm256_mul_pd( a47_0.v, c47_0.v );
377 a47_1.v = _mm256_mul_pd( a47_1.v, c47_1.v );
378 a47_2.v = _mm256_mul_pd( a47_2.v, c47_2.v );
379 a47_3.v = _mm256_mul_pd( a47_3.v, c47_3.v );
380 a03_0.v = _mm256_add_pd( a03_0.v, tmp.v );
381 a03_1.v = _mm256_add_pd( a03_1.v, tmp.v );
382 a03_2.v = _mm256_add_pd( a03_2.v, tmp.v );
383 a03_3.v = _mm256_add_pd( a03_3.v, tmp.v );
384 a47_0.v = _mm256_add_pd( a47_0.v, tmp.v );
385 a47_1.v = _mm256_add_pd( a47_1.v, tmp.v );
386 a47_2.v = _mm256_add_pd( a47_2.v, tmp.v );
387 a47_3.v = _mm256_add_pd( a47_3.v, tmp.v );
394 mask1023.v = _mm_setr_epi32( 1023, 1023, 1023, 1023 );
395 mask0.v = _mm_setr_epi32( 0, 0, 0, 0 );
396 offset.v = _mm_setr_epi32( 1023, 1023, 0, 0 );
403 k3.v = _mm_add_epi32( k03_0.v, mask1023.v );
406 k3.v = _mm_slli_epi32( k3.v, 20 );
409 k1.v = _mm_unpacklo_epi32( mask0.v, k3.v );
410 k2.v = _mm_unpackhi_epi32( mask0.v, k3.v );
414 p03_0.i = _mm256_insertf128_si256( p03_0.i, k1.v, 0 );
415 p03_0.i = _mm256_insertf128_si256( p03_0.i, k2.v, 1 );
423 k3.v = _mm_add_epi32( k03_1.v, mask1023.v );
426 k3.v = _mm_slli_epi32( k3.v, 20 );
429 k1.v = _mm_unpacklo_epi32( mask0.v, k3.v );
430 k2.v = _mm_unpackhi_epi32( mask0.v, k3.v );
434 p03_1.i = _mm256_insertf128_si256( p03_1.i, k1.v, 0 );
435 p03_1.i = _mm256_insertf128_si256( p03_1.i, k2.v, 1 );
442 k3.v = _mm_add_epi32( k03_2.v, mask1023.v );
445 k3.v = _mm_slli_epi32( k3.v, 20 );
448 k1.v = _mm_unpacklo_epi32( mask0.v, k3.v );
449 k2.v = _mm_unpackhi_epi32( mask0.v, k3.v );
453 p03_2.i = _mm256_insertf128_si256( p03_2.i, k1.v, 0 );
454 p03_2.i = _mm256_insertf128_si256( p03_2.i, k2.v, 1 );
461 k3.v = _mm_add_epi32( k03_3.v, mask1023.v );
464 k3.v = _mm_slli_epi32( k3.v, 20 );
467 k1.v = _mm_unpacklo_epi32( mask0.v, k3.v );
468 k2.v = _mm_unpackhi_epi32( mask0.v, k3.v );
472 p03_3.i = _mm256_insertf128_si256( p03_3.i, k1.v, 0 );
473 p03_3.i = _mm256_insertf128_si256( p03_3.i, k2.v, 1 );
480 k3.v = _mm_add_epi32( k47_0.v, mask1023.v );
483 k3.v = _mm_slli_epi32( k3.v, 20 );
486 k1.v = _mm_unpacklo_epi32( mask0.v, k3.v );
487 k2.v = _mm_unpackhi_epi32( mask0.v, k3.v );
491 p47_0.i = _mm256_insertf128_si256( p47_0.i, k1.v, 0 );
492 p47_0.i = _mm256_insertf128_si256( p47_0.i, k2.v, 1 );
498 k3.v = _mm_add_epi32( k47_1.v, mask1023.v );
501 k3.v = _mm_slli_epi32( k3.v, 20 );
504 k1.v = _mm_unpacklo_epi32( mask0.v, k3.v );
505 k2.v = _mm_unpackhi_epi32( mask0.v, k3.v );
509 p47_1.i = _mm256_insertf128_si256( p47_1.i, k1.v, 0 );
510 p47_1.i = _mm256_insertf128_si256( p47_1.i, k2.v, 1 );
516 k3.v = _mm_add_epi32( k47_2.v, mask1023.v );
519 k3.v = _mm_slli_epi32( k3.v, 20 );
522 k1.v = _mm_unpacklo_epi32( mask0.v, k3.v );
523 k2.v = _mm_unpackhi_epi32( mask0.v, k3.v );
527 p47_2.i = _mm256_insertf128_si256( p47_2.i, k1.v, 0 );
528 p47_2.i = _mm256_insertf128_si256( p47_2.i, k2.v, 1 );
534 k3.v = _mm_add_epi32( k47_3.v, mask1023.v );
537 k3.v = _mm_slli_epi32( k3.v, 20 );
540 k1.v = _mm_unpacklo_epi32( mask0.v, k3.v );
541 k2.v = _mm_unpackhi_epi32( mask0.v, k3.v );
545 p47_3.i = _mm256_insertf128_si256( p47_3.i, k1.v, 0 );
546 p47_3.i = _mm256_insertf128_si256( p47_3.i, k2.v, 1 );
553 c03_0.v = _mm256_mul_pd( a03_0.v, p03_0.v );
554 c03_1.v = _mm256_mul_pd( a03_1.v, p03_1.v );
555 c03_2.v = _mm256_mul_pd( a03_2.v, p03_2.v );
556 c03_3.v = _mm256_mul_pd( a03_3.v, p03_3.v );
557 c47_0.v = _mm256_mul_pd( a47_0.v, p47_0.v );
558 c47_1.v = _mm256_mul_pd( a47_1.v, p47_1.v );
559 c47_2.v = _mm256_mul_pd( a47_2.v, p47_2.v );
560 c47_3.v = _mm256_mul_pd( a47_3.v, p47_3.v );
Definition: avx_type.h:21
Definition: avx_type.h:13