HMLP: High-performance Machine Learning Primitives
intrin_wrapper.hpp
1 #ifndef _PVFMM_INTRIN_WRAPPER_HPP_
2 #define _PVFMM_INTRIN_WRAPPER_HPP_
3 
#include <cstddef>
#include <cstdint>
#include <cstring>

#include <pvfmm/math_utils.hpp>
#include <pvfmm/common.hpp>
7 
8 #ifdef __SSE__
9 #include <xmmintrin.h>
10 #endif
11 #ifdef __SSE2__
12 #include <emmintrin.h>
13 #endif
14 #ifdef __SSE3__
15 #include <pmmintrin.h>
16 #endif
17 #ifdef __AVX__
18 #include <immintrin.h>
19 #endif
20 #if defined(__MIC__)
21 #include <immintrin.h>
22 #endif
23 
24 namespace pvfmm {
25 
26 template <class T> inline T zero_intrin() { return (T)0; }
27 
28 template <class T, class Real> inline T set_intrin(const Real& a) { return a; }
29 
30 template <class T, class Real> inline T load_intrin(Real const* a) { return a[0]; }
31 
32 template <class T, class Real> inline T bcast_intrin(Real const* a) { return a[0]; }
33 
34 template <class T, class Real> inline void store_intrin(Real* a, const T& b) { a[0] = b; }
35 
36 template <class T> inline T mul_intrin(const T& a, const T& b) { return a * b; }
37 
38 template <class T> inline T add_intrin(const T& a, const T& b) { return a + b; }
39 
40 template <class T> inline T sub_intrin(const T& a, const T& b) { return a - b; }
41 
template <class T> inline T cmplt_intrin(const T& a, const T& b) {
  // Scalar emulation of a SIMD less-than compare: the result has every bit
  // set when (a < b) and is all-zero bits otherwise (matching _mm_cmplt_*).
  // The all-ones pattern is written into the object representation because
  // it is generally not a representable value of T (for IEEE floating-point
  // T it is a NaN bit pattern).
  T r = 0;
  if (a < b) std::memset(&r, 0xFF, sizeof(T));
  return r;
}
49 
template <class T> inline T and_intrin(const T& a, const T& b) {
  // Scalar emulation of a SIMD bitwise AND over the object representation,
  // typically used to apply the all-ones/all-zeros mask from cmplt_intrin.
  T r = 0;
  const uint8_t* a_ = reinterpret_cast<const uint8_t*>(&a);
  const uint8_t* b_ = reinterpret_cast<const uint8_t*>(&b);
  uint8_t* r_ = reinterpret_cast<uint8_t*>(&r);
  // std::size_t avoids the signed/unsigned comparison with sizeof(T).
  for (std::size_t i = 0; i < sizeof(T); i++) r_[i] = a_[i] & b_[i];
  return r;
}
58 
59 template <class T> inline T rsqrt_approx_intrin(const T& r2) {
60  if (r2 != 0) return 1.0 / pvfmm::sqrt<T>(r2);
61  return 0;
62 }
63 
template <class T, class Real> inline void rsqrt_newton_intrin(T& rinv, const T& r2, const Real& nwtn_const) {
  // One Newton-Raphson refinement step for 1/sqrt:
  //   rinv <- rinv * (nwtn_const - r2 * rinv^2)
  // Note: the conventional 1/2 factor is not applied here; callers of the
  // SIMD paths compensate for it in their final scaling.
  const T rinv_sq = rinv * rinv;
  rinv = rinv * (nwtn_const - r2 * rinv_sq);
}
65 
66 template <class T> inline T rsqrt_single_intrin(const T& r2) {
67  if (r2 != 0) return 1.0 / pvfmm::sqrt<T>(r2);
68  return 0;
69 }
70 
template <class T> inline T max_intrin(const T& a, const T& b) {
  // Lane-wise (scalar) maximum; on a tie the second argument is returned.
  return (a > b) ? a : b;
}
77 
template <class T> inline T min_intrin(const T& a, const T& b) {
  // Lane-wise (scalar) minimum; on a tie the first argument is returned.
  return (a > b) ? b : a;
}
84 
85 template <class T> inline T sin_intrin(const T& t) { return pvfmm::sin<T>(t); }
86 
87 template <class T> inline T cos_intrin(const T& t) { return pvfmm::cos<T>(t); }
88 
89 #ifdef __SSE3__
90 template <> inline __m128 zero_intrin() { return _mm_setzero_ps(); }
91 
92 template <> inline __m128d zero_intrin() { return _mm_setzero_pd(); }
93 
94 template <> inline __m128 set_intrin(const float& a) { return _mm_set_ps1(a); }
95 
96 template <> inline __m128d set_intrin(const double& a) { return _mm_set1_pd(a); }
97 
98 template <> inline __m128 load_intrin(float const* a) { return _mm_load_ps(a); }
99 
100 template <> inline __m128d load_intrin(double const* a) { return _mm_load_pd(a); }
101 
102 template <> inline __m128 bcast_intrin(float const* a) { return _mm_set_ps1(a[0]); }
103 
104 template <> inline __m128d bcast_intrin(double const* a) { return _mm_load_pd1(a); }
105 
106 template <> inline void store_intrin(float* a, const __m128& b) { return _mm_store_ps(a, b); }
107 
108 template <> inline void store_intrin(double* a, const __m128d& b) { return _mm_store_pd(a, b); }
109 
110 template <> inline __m128 mul_intrin(const __m128& a, const __m128& b) { return _mm_mul_ps(a, b); }
111 
112 template <> inline __m128d mul_intrin(const __m128d& a, const __m128d& b) { return _mm_mul_pd(a, b); }
113 
114 template <> inline __m128 add_intrin(const __m128& a, const __m128& b) { return _mm_add_ps(a, b); }
115 
116 template <> inline __m128d add_intrin(const __m128d& a, const __m128d& b) { return _mm_add_pd(a, b); }
117 
118 template <> inline __m128 sub_intrin(const __m128& a, const __m128& b) { return _mm_sub_ps(a, b); }
119 
120 template <> inline __m128d sub_intrin(const __m128d& a, const __m128d& b) { return _mm_sub_pd(a, b); }
121 
122 template <> inline __m128 cmplt_intrin(const __m128& a, const __m128& b) { return _mm_cmplt_ps(a, b); }
123 
124 template <> inline __m128d cmplt_intrin(const __m128d& a, const __m128d& b) { return _mm_cmplt_pd(a, b); }
125 
126 template <> inline __m128 and_intrin(const __m128& a, const __m128& b) { return _mm_and_ps(a, b); }
127 
128 template <> inline __m128d and_intrin(const __m128d& a, const __m128d& b) { return _mm_and_pd(a, b); }
129 
130 template <> inline __m128 rsqrt_approx_intrin(const __m128& r2) {
131  #define VEC_INTRIN __m128
132  #define RSQRT_INTRIN(a) _mm_rsqrt_ps(a)
133  #define CMPEQ_INTRIN(a, b) _mm_cmpeq_ps(a, b)
134  #define ANDNOT_INTRIN(a, b) _mm_andnot_ps(a, b)
135 
136  // Approx inverse square root which returns zero for r2=0
137  return ANDNOT_INTRIN(CMPEQ_INTRIN(r2, zero_intrin<VEC_INTRIN>()), RSQRT_INTRIN(r2));
138 
139  #undef VEC_INTRIN
140  #undef RSQRT_INTRIN
141  #undef CMPEQ_INTRIN
142  #undef ANDNOT_INTRIN
143 }
144 
145 template <> inline __m128d rsqrt_approx_intrin(const __m128d& r2) {
146  #define PD2PS(a) _mm_cvtpd_ps(a)
147  #define PS2PD(a) _mm_cvtps_pd(a)
148  return PS2PD(rsqrt_approx_intrin(PD2PS(r2)));
149  #undef PD2PS
150  #undef PS2PD
151 }
152 
153 template <> inline void rsqrt_newton_intrin(__m128& rinv, const __m128& r2, const float& nwtn_const) {
154  #define VEC_INTRIN __m128
155  // Newton iteration: rinv = 0.5 rinv_approx ( 3 - r2 rinv_approx^2 )
156  // We do not compute the product with 0.5 and this needs to be adjusted later
157  rinv = mul_intrin(rinv, sub_intrin(set_intrin<VEC_INTRIN>(nwtn_const), mul_intrin(r2, mul_intrin(rinv, rinv))));
158  #undef VEC_INTRIN
159 }
160 
161 template <> inline void rsqrt_newton_intrin(__m128d& rinv, const __m128d& r2, const double& nwtn_const) {
162  #define VEC_INTRIN __m128d
163  // Newton iteration: rinv = 0.5 rinv_approx ( 3 - r2 rinv_approx^2 )
164  // We do not compute the product with 0.5 and this needs to be adjusted later
165  rinv = mul_intrin(rinv, sub_intrin(set_intrin<VEC_INTRIN>(nwtn_const), mul_intrin(r2, mul_intrin(rinv, rinv))));
166  #undef VEC_INTRIN
167 }
168 
169 template <> inline __m128 rsqrt_single_intrin(const __m128& r2) {
170  #define VEC_INTRIN __m128
171  VEC_INTRIN rinv = rsqrt_approx_intrin(r2);
172  rsqrt_newton_intrin(rinv, r2, (float)3.0);
173  return rinv;
174  #undef VEC_INTRIN
175 }
176 
177 template <> inline __m128d rsqrt_single_intrin(const __m128d& r2) {
178  #define PD2PS(a) _mm_cvtpd_ps(a)
179  #define PS2PD(a) _mm_cvtps_pd(a)
180  return PS2PD(rsqrt_single_intrin(PD2PS(r2)));
181  #undef PD2PS
182  #undef PS2PD
183 }
184 
185 template <> inline __m128 max_intrin(const __m128& a, const __m128& b) { return _mm_max_ps(a, b); }
186 
187 template <> inline __m128d max_intrin(const __m128d& a, const __m128d& b) { return _mm_max_pd(a, b); }
188 
189 template <> inline __m128 min_intrin(const __m128& a, const __m128& b) { return _mm_min_ps(a, b); }
190 
191 template <> inline __m128d min_intrin(const __m128d& a, const __m128d& b) { return _mm_min_pd(a, b); }
192 
193 #ifdef PVFMM_HAVE_INTEL_SVML
194 template <> inline __m128 sin_intrin(const __m128& t) { return _mm_sin_ps(t); }
195 
196 template <> inline __m128 cos_intrin(const __m128& t) { return _mm_cos_ps(t); }
197 
198 template <> inline __m128d sin_intrin(const __m128d& t) { return _mm_sin_pd(t); }
199 
200 template <> inline __m128d cos_intrin(const __m128d& t) { return _mm_cos_pd(t); }
201 #else
202 template <> inline __m128 sin_intrin(const __m128& t_) {
203  union {
204  float e[4];
205  __m128 d;
206  } t;
207  store_intrin(t.e, t_);
208  return _mm_set_ps(pvfmm::sin<float>(t.e[3]), pvfmm::sin<float>(t.e[2]), pvfmm::sin<float>(t.e[1]), pvfmm::sin<float>(t.e[0]));
209 }
210 
211 template <> inline __m128 cos_intrin(const __m128& t_) {
212  union {
213  float e[4];
214  __m128 d;
215  } t;
216  store_intrin(t.e, t_);
217  return _mm_set_ps(pvfmm::cos<float>(t.e[3]), pvfmm::cos<float>(t.e[2]), pvfmm::cos<float>(t.e[1]), pvfmm::cos<float>(t.e[0]));
218 }
219 
220 template <> inline __m128d sin_intrin(const __m128d& t_) {
221  union {
222  double e[2];
223  __m128d d;
224  } t;
225  store_intrin(t.e, t_);
226  return _mm_set_pd(pvfmm::sin<double>(t.e[1]), pvfmm::sin<double>(t.e[0]));
227 }
228 
229 template <> inline __m128d cos_intrin(const __m128d& t_) {
230  union {
231  double e[2];
232  __m128d d;
233  } t;
234  store_intrin(t.e, t_);
235  return _mm_set_pd(pvfmm::cos<double>(t.e[1]), pvfmm::cos<double>(t.e[0]));
236 }
237 #endif
238 #endif
239 
240 #ifdef __AVX__
241 template <> inline __m256 zero_intrin() { return _mm256_setzero_ps(); }
242 
243 template <> inline __m256d zero_intrin() { return _mm256_setzero_pd(); }
244 
245 template <> inline __m256 set_intrin(const float& a) { return _mm256_set_ps(a, a, a, a, a, a, a, a); }
246 
247 template <> inline __m256d set_intrin(const double& a) { return _mm256_set_pd(a, a, a, a); }
248 
249 template <> inline __m256 load_intrin(float const* a) { return _mm256_load_ps(a); }
250 
251 template <> inline __m256d load_intrin(double const* a) { return _mm256_load_pd(a); }
252 
253 template <> inline __m256 bcast_intrin(float const* a) { return _mm256_broadcast_ss(a); }
254 
255 template <> inline __m256d bcast_intrin(double const* a) { return _mm256_broadcast_sd(a); }
256 
257 template <> inline void store_intrin(float* a, const __m256& b) { return _mm256_store_ps(a, b); }
258 
259 template <> inline void store_intrin(double* a, const __m256d& b) { return _mm256_store_pd(a, b); }
260 
261 template <> inline __m256 mul_intrin(const __m256& a, const __m256& b) { return _mm256_mul_ps(a, b); }
262 
263 template <> inline __m256d mul_intrin(const __m256d& a, const __m256d& b) { return _mm256_mul_pd(a, b); }
264 
265 template <> inline __m256 add_intrin(const __m256& a, const __m256& b) { return _mm256_add_ps(a, b); }
266 
267 template <> inline __m256d add_intrin(const __m256d& a, const __m256d& b) { return _mm256_add_pd(a, b); }
268 
269 template <> inline __m256 sub_intrin(const __m256& a, const __m256& b) { return _mm256_sub_ps(a, b); }
270 
271 template <> inline __m256d sub_intrin(const __m256d& a, const __m256d& b) { return _mm256_sub_pd(a, b); }
272 
273 template <> inline __m256 cmplt_intrin(const __m256& a, const __m256& b) { return _mm256_cmp_ps(a, b, _CMP_LT_OS); }
274 
275 template <> inline __m256d cmplt_intrin(const __m256d& a, const __m256d& b) { return _mm256_cmp_pd(a, b, _CMP_LT_OS); }
276 
277 template <> inline __m256 and_intrin(const __m256& a, const __m256& b) { return _mm256_and_ps(a, b); }
278 
279 template <> inline __m256d and_intrin(const __m256d& a, const __m256d& b) { return _mm256_and_pd(a, b); }
280 
281 template <> inline __m256 rsqrt_approx_intrin(const __m256& r2) {
282  #define VEC_INTRIN __m256
283  #define RSQRT_INTRIN(a) _mm256_rsqrt_ps(a)
284  #define CMPEQ_INTRIN(a, b) _mm256_cmp_ps(a, b, _CMP_EQ_OS)
285  #define ANDNOT_INTRIN(a, b) _mm256_andnot_ps(a, b)
286 
287  // Approx inverse square root which returns zero for r2=0
288  return ANDNOT_INTRIN(CMPEQ_INTRIN(r2, zero_intrin<VEC_INTRIN>()), RSQRT_INTRIN(r2));
289 
290  #undef VEC_INTRIN
291  #undef RSQRT_INTRIN
292  #undef CMPEQ_INTRIN
293  #undef ANDNOT_INTRIN
294 }
295 
296 template <> inline __m256d rsqrt_approx_intrin(const __m256d& r2) {
297  #define PD2PS(a) _mm256_cvtpd_ps(a)
298  #define PS2PD(a) _mm256_cvtps_pd(a)
299  return PS2PD(rsqrt_approx_intrin(PD2PS(r2)));
300  #undef PD2PS
301  #undef PS2PD
302 }
303 
304 template <> inline void rsqrt_newton_intrin(__m256& rinv, const __m256& r2, const float& nwtn_const) {
305  #define VEC_INTRIN __m256
306  // Newton iteration: rinv = 0.5 rinv_approx ( 3 - r2 rinv_approx^2 )
307  // We do not compute the product with 0.5 and this needs to be adjusted later
308  rinv = mul_intrin(rinv, sub_intrin(set_intrin<VEC_INTRIN>(nwtn_const), mul_intrin(r2, mul_intrin(rinv, rinv))));
309  #undef VEC_INTRIN
310 }
311 
312 template <> inline void rsqrt_newton_intrin(__m256d& rinv, const __m256d& r2, const double& nwtn_const) {
313  #define VEC_INTRIN __m256d
314  // Newton iteration: rinv = 0.5 rinv_approx ( 3 - r2 rinv_approx^2 )
315  // We do not compute the product with 0.5 and this needs to be adjusted later
316  rinv = mul_intrin(rinv, sub_intrin(set_intrin<VEC_INTRIN>(nwtn_const), mul_intrin(r2, mul_intrin(rinv, rinv))));
317  #undef VEC_INTRIN
318 }
319 
320 template <> inline __m256 rsqrt_single_intrin(const __m256& r2) {
321  #define VEC_INTRIN __m256
322  VEC_INTRIN rinv = rsqrt_approx_intrin(r2);
323  rsqrt_newton_intrin(rinv, r2, (float)3.0);
324  return rinv;
325  #undef VEC_INTRIN
326 }
327 
328 template <> inline __m256d rsqrt_single_intrin(const __m256d& r2) {
329  #define PD2PS(a) _mm256_cvtpd_ps(a)
330  #define PS2PD(a) _mm256_cvtps_pd(a)
331  return PS2PD(rsqrt_single_intrin(PD2PS(r2)));
332  #undef PD2PS
333  #undef PS2PD
334 }
335 
336 template <> inline __m256 max_intrin(const __m256& a, const __m256& b) { return _mm256_max_ps(a, b); }
337 
338 template <> inline __m256d max_intrin(const __m256d& a, const __m256d& b) { return _mm256_max_pd(a, b); }
339 
340 template <> inline __m256 min_intrin(const __m256& a, const __m256& b) { return _mm256_min_ps(a, b); }
341 
342 template <> inline __m256d min_intrin(const __m256d& a, const __m256d& b) { return _mm256_min_pd(a, b); }
343 
344 #ifdef PVFMM_HAVE_INTEL_SVML
345 template <> inline __m256 sin_intrin(const __m256& t) { return _mm256_sin_ps(t); }
346 
347 template <> inline __m256 cos_intrin(const __m256& t) { return _mm256_cos_ps(t); }
348 
349 template <> inline __m256d sin_intrin(const __m256d& t) { return _mm256_sin_pd(t); }
350 
351 template <> inline __m256d cos_intrin(const __m256d& t) { return _mm256_cos_pd(t); }
352 #else
353 template <> inline __m256 sin_intrin(const __m256& t_) {
354  union {
355  float e[8];
356  __m256 d;
357  } t;
358  store_intrin(t.e, t_); // t.d=t_;
359  return _mm256_set_ps(pvfmm::sin<float>(t.e[7]), pvfmm::sin<float>(t.e[6]), pvfmm::sin<float>(t.e[5]), pvfmm::sin<float>(t.e[4]), pvfmm::sin<float>(t.e[3]), pvfmm::sin<float>(t.e[2]), pvfmm::sin<float>(t.e[1]), pvfmm::sin<float>(t.e[0]));
360 }
361 
362 template <> inline __m256 cos_intrin(const __m256& t_) {
363  union {
364  float e[8];
365  __m256 d;
366  } t;
367  store_intrin(t.e, t_); // t.d=t_;
368  return _mm256_set_ps(pvfmm::cos<float>(t.e[7]), pvfmm::cos<float>(t.e[6]), pvfmm::cos<float>(t.e[5]), pvfmm::cos<float>(t.e[4]), pvfmm::cos<float>(t.e[3]), pvfmm::cos<float>(t.e[2]), pvfmm::cos<float>(t.e[1]), pvfmm::cos<float>(t.e[0]));
369 }
370 
371 template <> inline __m256d sin_intrin(const __m256d& t_) {
372  union {
373  double e[4];
374  __m256d d;
375  } t;
376  store_intrin(t.e, t_); // t.d=t_;
377  return _mm256_set_pd(pvfmm::sin<double>(t.e[3]), pvfmm::sin<double>(t.e[2]), pvfmm::sin<double>(t.e[1]), pvfmm::sin<double>(t.e[0]));
378 }
379 
380 template <> inline __m256d cos_intrin(const __m256d& t_) {
381  union {
382  double e[4];
383  __m256d d;
384  } t;
385  store_intrin(t.e, t_); // t.d=t_;
386  return _mm256_set_pd(pvfmm::cos<double>(t.e[3]), pvfmm::cos<double>(t.e[2]), pvfmm::cos<double>(t.e[1]), pvfmm::cos<double>(t.e[0]));
387 }
388 #endif
389 #endif
390 
template <class VEC, class Real> inline VEC rsqrt_intrin0(VEC r2) {
  // No Newton refinement: return the raw approximate reciprocal square
  // root (zero for r2 == 0 lanes). The former NWTN0..NWTN3 #if scaffolding
  // had every flag set to 0, so this is the only live path. `Real` is kept
  // so all rsqrt_intrin* variants share the same signature.
  return rsqrt_approx_intrin(r2);
}
426 
template <class VEC, class Real> inline VEC rsqrt_intrin1(VEC r2) {
  // Approximate rsqrt refined by one Newton step (dead NWTN* preprocessor
  // scaffolding removed; only the NWTN1 path was live).
  //
  // Each Newton step omits the conventional 1/2 factor, which scales the
  // result: the running `scal` tracks that growth (scal' = 2*scal^3) and
  // the step constant is 3*scal^2. After one step the result carries a
  // factor of 2 that the caller must divide out.
  Real scal = 1;
  Real const_nwtn1 = 3 * scal * scal;

  VEC rinv = rsqrt_approx_intrin(r2);
  rsqrt_newton_intrin(rinv, r2, const_nwtn1);
  return rinv;
}
463 
template <class VEC, class Real> inline VEC rsqrt_intrin2(VEC r2) {
  // Approximate rsqrt refined by two Newton steps (dead NWTN* preprocessor
  // scaffolding removed; only the NWTN1 and NWTN2 paths were live).
  //
  // Each Newton step omits the conventional 1/2 factor; `scal` tracks the
  // resulting growth (scal' = 2*scal^3) and each step constant is
  // 3*scal^2. After two steps the result carries a factor of 16 that the
  // caller must divide out.
  Real scal = 1;
  Real const_nwtn1 = 3 * scal * scal;
  scal = 2 * scal * scal * scal;
  Real const_nwtn2 = 3 * scal * scal;

  VEC rinv = rsqrt_approx_intrin(r2);
  rsqrt_newton_intrin(rinv, r2, const_nwtn1);
  rsqrt_newton_intrin(rinv, r2, const_nwtn2);
  return rinv;
}
501 
template <class VEC, class Real> inline VEC rsqrt_intrin3(VEC r2) {
  // Approximate rsqrt refined by three Newton steps (dead NWTN* preprocessor
  // scaffolding removed; the NWTN1..NWTN3 paths were live).
  //
  // Each Newton step omits the conventional 1/2 factor; `scal` tracks the
  // resulting growth (scal' = 2*scal^3) and each step constant is
  // 3*scal^2. After three steps the result carries a factor of 8192 that
  // the caller must divide out.
  Real scal = 1;
  Real const_nwtn1 = 3 * scal * scal;
  scal = 2 * scal * scal * scal;
  Real const_nwtn2 = 3 * scal * scal;
  scal = 2 * scal * scal * scal;
  Real const_nwtn3 = 3 * scal * scal;

  VEC rinv = rsqrt_approx_intrin(r2);
  rsqrt_newton_intrin(rinv, r2, const_nwtn1);
  rsqrt_newton_intrin(rinv, r2, const_nwtn2);
  rsqrt_newton_intrin(rinv, r2, const_nwtn3);
  return rinv;
}
540 }
541 
542 #endif //_PVFMM_INTRIN_WRAPPER_HPP_
Definition: cheb_utils.hpp:12