HMLP: High-performance Machine Learning Primitives
kernel.hpp
1 #ifndef _PVFMM_KERNEL_HPP_
2 #define _PVFMM_KERNEL_HPP_
3 
4 #include <pvfmm/intrin_wrapper.hpp>
5 #include <pvfmm/mem_mgr.hpp>
6 #include <pvfmm/matrix.hpp>
7 #include <pvfmm/vector.hpp>
8 #include <pvfmm/common.hpp>
9 
10 #include <functional>
11 
12 namespace pvfmm {
13 
14 template <class ValueType, Integer DIM> class KernelFunction {
15 
16  public:
17  typedef void (KerFn)(const Vector<ValueType>& r_src, const Vector<ValueType>& n_src, const Vector<ValueType>& v_src, const Vector<ValueType>& r_trg, Vector<ValueType>& v_trg, const void* ctx);
18 
19  KernelFunction(KerFn& ker_, Integer d_src, Integer d_trg, std::string ker_name_, void* ctx_ = NULL, bool verbose = false) : ker(ker_), ctx(ctx_) {
20  ker_name = ker_name_ + std::to_string(sizeof(ValueType) * 8);
21  ker_dim[0] = d_src;
22  ker_dim[1] = d_trg;
23  scale_invar = true;
24  Init(verbose);
25  }
26 
27  virtual ~KernelFunction() {}
28 
29  virtual void operator()(const Vector<ValueType>& r_src, const Vector<ValueType>& n_src, const Vector<ValueType>& v_src, const Vector<ValueType>& r_trg, Vector<ValueType>& v_trg) const { ker(r_src, n_src, v_src, r_trg, v_trg, ctx); }
30 
31  std::string Name() const { return ker_name; }
32 
33  Integer Dim(Integer i) const { return ker_dim[i]; }
34 
35  bool ScaleInvariant() const { return scale_invar; }
36 
37  void BuildMatrix(const Vector<ValueType>& r_src, const Vector<ValueType>& n_src, const Vector<ValueType>& r_trg, Matrix<ValueType>& M) const {
38  Long Nsrc = r_src.Dim() / DIM;
39  Long Ntrg = r_trg.Dim() / DIM;
40  PVFMM_ASSERT(r_src.Dim() == Nsrc * DIM);
41  PVFMM_ASSERT(r_trg.Dim() == Ntrg * DIM);
42  if (M.Dim(0) != Nsrc * ker_dim[0] || M.Dim(1) != Ntrg * ker_dim[1]) {
43  M.ReInit(Nsrc * ker_dim[0], Ntrg * ker_dim[1]);
44  }
45 
46  Integer omp_p = omp_get_max_threads();
47  #pragma omp parallel for schedule(static)
48  for (Integer tid = 0; tid < omp_p; tid++) {
49  Vector<ValueType> r_src_, n_src_, v_src_(ker_dim[0]), v_trg;
50  Long a = Nsrc * (tid + 0) / omp_p;
51  Long b = Nsrc * (tid + 1) / omp_p;
52  for (Long i = a; i < b; i++) {
53  { // Set r_src_
54  r_src_.ReInit(DIM, (Iterator<ValueType>)r_src.Begin() + i * DIM, false);
55  }
56  if (n_src.Dim()) { // Set n_src_
57  n_src_.ReInit(DIM, (Iterator<ValueType>)n_src.Begin() + i * DIM, false);
58  }
59  for (Integer j = 0; j < ker_dim[0]; j++) {
60  { // Set v_src_
61  v_src_.SetZero();
62  v_src_[j] = 1;
63  }
64  { // Set v_trg
65  v_trg.ReInit(Ntrg * ker_dim[1], M[i * ker_dim[0] + j], false);
66  v_trg.SetZero();
67  }
68  (*this)(r_src_, n_src_, v_src_, r_trg, v_trg);
69  }
70  }
71  }
72  }
73 
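 // Usage sketch (annotation, not part of kernel.hpp): BuildMatrix fills M so
 // that row (i * ker_dim[0] + j) holds the response at all targets to a unit
 // j-th density component at source i. A density row-vector q of length
 // Nsrc * ker_dim[0] therefore maps to target values with a single product:
 //
 //   Matrix<double> M;
 //   ker.BuildMatrix(r_src, n_src, r_trg, M);  // (Nsrc*d_src) x (Ntrg*d_trg)
 //   Matrix<double> q(1, Nsrc * ker.Dim(0));   // hypothetical density values
 //   Matrix<double> u = q * M;                 // u: 1 x (Ntrg * ker.Dim(1))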
74  const Permutation<ValueType>& SrcPerm(Integer idx) const {
75  PVFMM_ASSERT(idx < SymmTransCount);
76  return src_perm_vec[idx];
77  }
78 
79  const Permutation<ValueType>& TrgPerm(Integer idx) const {
80  PVFMM_ASSERT(idx < SymmTransCount);
81  return trg_perm_vec[idx];
82  }
83 
84  protected:
85  void Init(bool verbose) {
86  ValueType eps = 1.0;
87  while (eps + (ValueType)1.0 > 1.0) eps *= 0.5;
88 
89  Long N = 10240;
90  ValueType eps_ = N * N * eps;
91  Vector<ValueType> trg_coord(DIM);
92  Vector<ValueType> src_coord1(N * DIM);
93  Vector<ValueType> src_norml1(N * DIM);
94  Matrix<ValueType> M1(N, ker_dim[0] * ker_dim[1]);
95  { // Set trg_coord, src_coord1, src_norml1, M1
96  ValueType scal = 1.0;
97  trg_coord.SetZero();
98  while (true) {
99  ValueType abs_sum = 0;
100  for (Long i = 0; i < N; i++) {
101  ValueType x[DIM], r;
102  do {
103  r = 0;
104  for (Integer j = 0; j < DIM; j++) {
105  x[j] = (drand48() - 0.5);
106  r += x[j] * x[j];
107  }
108  r = pvfmm::sqrt<ValueType>(r);
109  } while (r < 0.25);
110  for (Integer j = 0; j < DIM; j++) {
111  src_coord1[i * DIM + j] = x[j] * scal;
112  }
113 
114  do {
115  r = 0;
116  for (Integer j = 0; j < DIM; j++) {
117  x[j] = (drand48() - 0.5);
118  r += x[j] * x[j];
119  }
120  r = pvfmm::sqrt<ValueType>(r);
121  } while (r < 0.25);
122  for (Integer j = 0; j < DIM; j++) {
123  src_norml1[i * DIM + j] = x[j] / r;
124  }
125  }
126  { // Set M1
127  M1.SetZero();
128  Matrix<ValueType> M(N * ker_dim[0], ker_dim[1], M1.Begin(), false);
129  BuildMatrix(src_coord1, src_norml1, trg_coord, M);
130  }
131  for (Long i = 0; i < M1.Dim(0) * M1.Dim(1); i++) {
132  abs_sum += pvfmm::fabs<ValueType>(M1[0][i]);
133  }
134  if (abs_sum > pvfmm::sqrt<ValueType>(eps) || scal < eps) break;
135  scal = scal * 0.5;
136  }
137  }
138 
139  if (ker_dim[0] * ker_dim[1] > 0) { // Determine scaling
140  Vector<ValueType> src_coord2(N * DIM);
141  Vector<ValueType> src_norml2(N * DIM);
142  Matrix<ValueType> M2(N, ker_dim[0] * ker_dim[1]);
 143  for (Long i = 0; i < N * DIM; i++) { // Set src_coord2, src_norml2
144  src_coord2[i] = src_coord1[i] * 0.5;
145  src_norml2[i] = src_norml1[i];
146  }
147  { // Set M2
148  M2.SetZero();
149  Matrix<ValueType> M(N * ker_dim[0], ker_dim[1], M2.Begin(), false);
150  BuildMatrix(src_coord2, src_norml2, trg_coord, M);
151  }
152 
153  ValueType max_val = 0;
154  for (Long i = 0; i < ker_dim[0] * ker_dim[1]; i++) {
155  ValueType dot11 = 0, dot22 = 0;
156  for (Long j = 0; j < N; j++) {
157  dot11 += M1[j][i] * M1[j][i];
158  dot22 += M2[j][i] * M2[j][i];
159  }
160  max_val = std::max<ValueType>(max_val, dot11);
161  max_val = std::max<ValueType>(max_val, dot22);
162  }
163 
164  Matrix<ValueType> M_scal(ker_dim[0], ker_dim[1]);
165  if (scale_invar) { // Set M_scal[i][j] = log_2(s[i][j])
166  for (Long i = 0; i < ker_dim[0] * ker_dim[1]; i++) {
167  ValueType dot11 = 0, dot12 = 0, dot22 = 0;
168  for (Long j = 0; j < N; j++) {
169  dot11 += M1[j][i] * M1[j][i];
170  dot12 += M1[j][i] * M2[j][i];
171  dot22 += M2[j][i] * M2[j][i];
172  }
173  if (dot11 > max_val * eps && dot22 > max_val * eps) {
174  PVFMM_ASSERT(dot12 > 0);
175  ValueType s = dot12 / dot11;
176  M_scal[0][i] = pvfmm::log<ValueType>(s) / pvfmm::log<ValueType>(2.0);
177  ValueType err = pvfmm::sqrt<ValueType>((dot22 / dot11) / (s * s) - 1.0);
178  if (err > eps_) {
179  scale_invar = false;
180  M_scal[0][i] = 0.0;
181  }
182  // assert(M_scal[0][i]>=0.0); // Kernel function must decay
183  } else if (dot11 > max_val * eps || dot22 > max_val * eps) {
184  scale_invar = false;
185  M_scal[0][i] = 0.0;
186  } else {
187  M_scal[0][i] = -1;
188  }
189  }
190  }
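 // Annotation: for a scale-invariant kernel each matrix entry satisfies
 // K(x/2) = s * K(x) for a fixed per-entry factor s. With M1 sampled at the
 // original points and M2 at the points scaled by 1/2, the least-squares fit
 // of M2 = s * M1 over the N samples is s = dot12 / dot11, and M_scal stores
 // log2(s). The residual sqrt((dot22/dot11)/s^2 - 1) measures deviation from
 // exact proportionality; exceeding eps_ marks the kernel as not
 // scale-invariant.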
191 
 192  if (scale_invar) { // Set src_perm_vec[0], trg_perm_vec[0] (coarsen) and src_perm_vec[1], trg_perm_vec[1] (refine)
193  Matrix<ValueType> b(ker_dim[0] * ker_dim[1] + 1, 1);
194  { // Set b
195  memcopy(b.Begin(), M_scal.Begin(), ker_dim[0] * ker_dim[1]);
196  b[ker_dim[0] * ker_dim[1]][0] = 0;
197  }
198 
199  Matrix<ValueType> M(ker_dim[0] * ker_dim[1] + 1, ker_dim[0] + ker_dim[1]);
200  { // Set M
201  M.SetZero();
202  for (Long i0 = 0; i0 < ker_dim[0]; i0++) {
203  for (Long i1 = 0; i1 < ker_dim[1]; i1++) {
204  Long j = i0 * ker_dim[1] + i1;
205  if (b[j][0] > 0) {
206  M[j][i0 + 0] = 1;
207  M[j][i1 + ker_dim[0]] = 1;
208  }
209  }
210  }
211  M[ker_dim[0] * ker_dim[1]][0] = 1;
212  }
213 
214  Matrix<ValueType> x = M.pinv() * b;
215  for (Long i0 = 0; i0 < ker_dim[0]; i0++) { // Check solution
216  for (Long i1 = 0; i1 < ker_dim[1]; i1++) {
217  if (M_scal[i0][i1] >= 0) {
218  if (pvfmm::fabs<ValueType>(x[i0][0] + x[ker_dim[0] + i1][0] - M_scal[i0][i1]) > eps_) {
219  scale_invar = false;
220  }
221  }
222  }
223  }
224 
225  Permutation<ValueType> P_src(ker_dim[0]);
226  Permutation<ValueType> P_trg(ker_dim[1]);
227  if (scale_invar) { // Set P_src, P_trg (coarsen)
228  for (Long i = 0; i < ker_dim[0]; i++) {
229  P_src.scal[i] = pvfmm::pow<ValueType>(0.5, x[i][0]);
230  }
231  for (Long i = 0; i < ker_dim[1]; i++) {
232  P_trg.scal[i] = pvfmm::pow<ValueType>(0.5, x[ker_dim[0] + i][0]);
233  }
234  }
235  src_perm_vec[0] = P_src;
236  trg_perm_vec[0] = P_trg;
237  if (scale_invar) { // Set P_src, P_trg (refine)
238  for (Long i = 0; i < ker_dim[0]; i++) {
239  P_src.scal[i] = pvfmm::pow<ValueType>(0.5, -x[i][0]);
240  }
241  for (Long i = 0; i < ker_dim[1]; i++) {
242  P_trg.scal[i] = pvfmm::pow<ValueType>(0.5, -x[ker_dim[0] + i][0]);
243  }
244  }
245  src_perm_vec[1] = P_src;
246  trg_perm_vec[1] = P_trg;
247  }
248  }
249  if (ker_dim[0] * ker_dim[1] > 0) { // Determine symmetry
250  for (Integer p_type = 2; p_type < SymmTransCount; p_type++) { // For each symmetry transform
251  Vector<ValueType> src_coord2(N * DIM);
252  Vector<ValueType> src_norml2(N * DIM);
253  Matrix<ValueType> M2(N, ker_dim[0] * ker_dim[1]);
254  if (p_type < 2 + DIM) { // Reflect
255  for (Long i = 0; i < N; i++) {
256  for (Integer j = 0; j < DIM; j++) {
257  if (p_type == 2 + j) {
258  src_coord2[i * DIM + j] = -src_coord1[i * DIM + j];
259  src_norml2[i * DIM + j] = -src_norml1[i * DIM + j];
260  } else {
261  src_coord2[i * DIM + j] = src_coord1[i * DIM + j];
262  src_norml2[i * DIM + j] = src_norml1[i * DIM + j];
263  }
264  }
265  }
266  } else {
267  Integer swap0, swap1;
268  { // Set swap0, swap1
269  Integer iter = 2 + DIM;
270  for (Integer i = 0; i < DIM; i++) {
271  for (Integer j = i + 1; j < DIM; j++) {
272  if (iter == p_type) {
273  swap0 = i;
274  swap1 = j;
275  }
276  iter++;
277  }
278  }
279  }
280  for (Long i = 0; i < N; i++) {
281  for (Integer j = 0; j < DIM; j++) {
282  if (j == swap0) {
283  src_coord2[i * DIM + j] = src_coord1[i * DIM + swap1];
284  src_norml2[i * DIM + j] = src_norml1[i * DIM + swap1];
285  } else if (j == swap1) {
286  src_coord2[i * DIM + j] = src_coord1[i * DIM + swap0];
287  src_norml2[i * DIM + j] = src_norml1[i * DIM + swap0];
288  } else {
289  src_coord2[i * DIM + j] = src_coord1[i * DIM + j];
290  src_norml2[i * DIM + j] = src_norml1[i * DIM + j];
291  }
292  }
293  }
294  }
295  { // Set M2
296  M2.SetZero();
297  Matrix<ValueType> M(N * ker_dim[0], ker_dim[1], M2.Begin(), false);
298  BuildMatrix(src_coord2, src_norml2, trg_coord, M);
299  }
300 
301  Matrix<Long> M11, M22;
302  { // Set M11, M22
303  Matrix<ValueType> dot11(ker_dim[0] * ker_dim[1], ker_dim[0] * ker_dim[1]);
304  Matrix<ValueType> dot12(ker_dim[0] * ker_dim[1], ker_dim[0] * ker_dim[1]);
305  Matrix<ValueType> dot22(ker_dim[0] * ker_dim[1], ker_dim[0] * ker_dim[1]);
306  Vector<ValueType> norm1(ker_dim[0] * ker_dim[1]);
307  Vector<ValueType> norm2(ker_dim[0] * ker_dim[1]);
308  { // Set dot11, dot12, dot22, norm1, norm2
309  dot11.SetZero();
310  dot12.SetZero();
311  dot22.SetZero();
312  for (Long k = 0; k < N; k++) {
313  for (Long i = 0; i < ker_dim[0] * ker_dim[1]; i++) {
314  for (Long j = 0; j < ker_dim[0] * ker_dim[1]; j++) {
315  dot11[i][j] += M1[k][i] * M1[k][j];
316  dot12[i][j] += M1[k][i] * M2[k][j];
317  dot22[i][j] += M2[k][i] * M2[k][j];
318  }
319  }
320  }
321  for (Long i = 0; i < ker_dim[0] * ker_dim[1]; i++) {
322  norm1[i] = pvfmm::sqrt<ValueType>(dot11[i][i]);
323  norm2[i] = pvfmm::sqrt<ValueType>(dot22[i][i]);
324  }
325  for (Long i = 0; i < ker_dim[0] * ker_dim[1]; i++) {
326  for (Long j = 0; j < ker_dim[0] * ker_dim[1]; j++) {
327  dot11[i][j] /= (norm1[i] * norm1[j]);
328  dot12[i][j] /= (norm1[i] * norm2[j]);
329  dot22[i][j] /= (norm2[i] * norm2[j]);
330  }
331  }
332  }
333 
334  Long flag = 1;
335  M11.ReInit(ker_dim[0], ker_dim[1]);
336  M22.ReInit(ker_dim[0], ker_dim[1]);
337  M11.SetZero();
338  M22.SetZero();
339  for (Long i = 0; i < ker_dim[0] * ker_dim[1]; i++) {
340  if (norm1[i] > eps_ && M11[0][i] == 0) {
341  for (Long j = 0; j < ker_dim[0] * ker_dim[1]; j++) {
342  if (pvfmm::fabs<ValueType>(norm1[i] - norm1[j]) < eps_ && pvfmm::fabs<ValueType>(pvfmm::fabs<ValueType>(dot11[i][j]) - 1.0) < eps_) {
343  M11[0][j] = (dot11[i][j] > 0 ? flag : -flag);
344  }
345  if (pvfmm::fabs<ValueType>(norm1[i] - norm2[j]) < eps_ && pvfmm::fabs<ValueType>(pvfmm::fabs<ValueType>(dot12[i][j]) - 1.0) < eps_) {
346  M22[0][j] = (dot12[i][j] > 0 ? flag : -flag);
347  }
348  }
349  flag++;
350  }
351  }
352  }
353 
354  Matrix<Long> P1, P2;
355  { // P1
356  Matrix<Long>& P = P1;
357  Matrix<Long> M1 = M11;
358  Matrix<Long> M2 = M22;
359  for (Long i = 0; i < M1.Dim(0); i++) {
360  for (Long j = 0; j < M1.Dim(1); j++) {
361  if (M1[i][j] < 0) M1[i][j] = -M1[i][j];
362  if (M2[i][j] < 0) M2[i][j] = -M2[i][j];
363  }
364  std::sort(M1[i], M1[i] + M1.Dim(1));
365  std::sort(M2[i], M2[i] + M2.Dim(1));
366  }
367  P.ReInit(M1.Dim(0), M1.Dim(0));
368  for (Long i = 0; i < M1.Dim(0); i++) {
369  for (Long j = 0; j < M1.Dim(0); j++) {
370  P[i][j] = 1;
371  for (Long k = 0; k < M1.Dim(1); k++) {
372  if (M1[i][k] != M2[j][k]) {
373  P[i][j] = 0;
374  break;
375  }
376  }
377  }
378  }
379  }
380  { // P2
381  Matrix<Long>& P = P2;
382  Matrix<Long> M1 = M11.Transpose();
383  Matrix<Long> M2 = M22.Transpose();
384  for (Long i = 0; i < M1.Dim(0); i++) {
385  for (Long j = 0; j < M1.Dim(1); j++) {
386  if (M1[i][j] < 0) M1[i][j] = -M1[i][j];
387  if (M2[i][j] < 0) M2[i][j] = -M2[i][j];
388  }
389  std::sort(M1[i], M1[i] + M1.Dim(1));
390  std::sort(M2[i], M2[i] + M2.Dim(1));
391  }
392  P.ReInit(M1.Dim(0), M1.Dim(0));
393  for (Long i = 0; i < M1.Dim(0); i++) {
394  for (Long j = 0; j < M1.Dim(0); j++) {
395  P[i][j] = 1;
396  for (Long k = 0; k < M1.Dim(1); k++) {
397  if (M1[i][k] != M2[j][k]) {
398  P[i][j] = 0;
399  break;
400  }
401  }
402  }
403  }
404  }
405 
406  Vector<Permutation<Long>> P1vec, P2vec;
407  { // P1vec
408  Matrix<Long>& Pmat = P1;
409  Vector<Permutation<Long>>& Pvec = P1vec;
410 
411  Permutation<Long> P(Pmat.Dim(0));
412  Vector<Long>& perm = P.perm;
413  perm.SetZero();
414 
415  // First permutation
416  for (Long i = 0; i < P.Dim(); i++) {
417  for (Long j = 0; j < P.Dim(); j++) {
418  if (Pmat[i][j]) {
419  perm[i] = j;
420  break;
421  }
422  }
423  }
424 
425  Vector<Long> perm_tmp;
426  while (true) { // Next permutation
427  perm_tmp = perm;
428  std::sort(&perm_tmp[0], &perm_tmp[0] + perm_tmp.Dim());
429  for (Long i = 0; i < perm_tmp.Dim(); i++) {
430  if (perm_tmp[i] != i) break;
431  if (i == perm_tmp.Dim() - 1) {
432  Pvec.PushBack(P);
433  }
434  }
435 
436  bool last = false;
437  for (Long i = 0; i < P.Dim(); i++) {
438  Long tmp = perm[i];
439  for (Long j = perm[i] + 1; j < P.Dim(); j++) {
440  if (Pmat[i][j]) {
441  perm[i] = j;
442  break;
443  }
444  }
445  if (perm[i] > tmp) break;
446  for (Long j = 0; j < P.Dim(); j++) {
447  if (Pmat[i][j]) {
448  perm[i] = j;
449  break;
450  }
451  }
452  if (i == P.Dim() - 1) last = true;
453  }
454  if (last) break;
455  }
456  }
457  { // P2vec
458  Matrix<Long>& Pmat = P2;
459  Vector<Permutation<Long>>& Pvec = P2vec;
460 
461  Permutation<Long> P(Pmat.Dim(0));
462  Vector<Long>& perm = P.perm;
463  perm.SetZero();
464 
465  // First permutation
466  for (Long i = 0; i < P.Dim(); i++) {
467  for (Long j = 0; j < P.Dim(); j++) {
468  if (Pmat[i][j]) {
469  perm[i] = j;
470  break;
471  }
472  }
473  }
474 
475  Vector<Long> perm_tmp;
476  while (true) { // Next permutation
477  perm_tmp = perm;
478  std::sort(&perm_tmp[0], &perm_tmp[0] + perm_tmp.Dim());
479  for (Long i = 0; i < perm_tmp.Dim(); i++) {
480  if (perm_tmp[i] != i) break;
481  if (i == perm_tmp.Dim() - 1) {
482  Pvec.PushBack(P);
483  }
484  }
485 
486  bool last = false;
487  for (Long i = 0; i < P.Dim(); i++) {
488  Long tmp = perm[i];
489  for (Long j = perm[i] + 1; j < P.Dim(); j++) {
490  if (Pmat[i][j]) {
491  perm[i] = j;
492  break;
493  }
494  }
495  if (perm[i] > tmp) break;
496  for (Long j = 0; j < P.Dim(); j++) {
497  if (Pmat[i][j]) {
498  perm[i] = j;
499  break;
500  }
501  }
502  if (i == P.Dim() - 1) last = true;
503  }
504  if (last) break;
505  }
506  }
507 
 508  { // Find pairs which actually work (neglect scaling)
509  Vector<Permutation<Long>> P1vec_, P2vec_;
510  Matrix<Long> M1 = M11;
511  Matrix<Long> M2 = M22;
512  for (Long i = 0; i < M1.Dim(0); i++) {
513  for (Long j = 0; j < M1.Dim(1); j++) {
514  if (M1[i][j] < 0) M1[i][j] = -M1[i][j];
515  if (M2[i][j] < 0) M2[i][j] = -M2[i][j];
516  }
517  }
518 
519  Matrix<Long> M;
520  for (Long i = 0; i < P1vec.Dim(); i++) {
521  for (Long j = 0; j < P2vec.Dim(); j++) {
522  M = P1vec[i] * M2 * P2vec[j];
523  for (Long k = 0; k < M.Dim(0) * M.Dim(1); k++) {
524  if (M[0][k] != M1[0][k]) break;
525  if (k == M.Dim(0) * M.Dim(1) - 1) {
526  P1vec_.PushBack(P1vec[i]);
527  P2vec_.PushBack(P2vec[j]);
528  }
529  }
530  }
531  }
532 
533  P1vec = P1vec_;
534  P2vec = P2vec_;
535  }
536 
537  Permutation<ValueType> P1_, P2_;
 538  { // Find pairs which actually work
539  for (Long k = 0; k < P1vec.Dim(); k++) {
540  Permutation<Long> P1 = P1vec[k];
541  Permutation<Long> P2 = P2vec[k];
542  Matrix<Long> M1 = M11;
543  Matrix<Long> M2 = P1 * M22 * P2;
544 
545  Matrix<ValueType> M(M1.Dim(0) * M1.Dim(1) + 1, M1.Dim(0) + M1.Dim(1));
546  M.SetZero();
547  M[M1.Dim(0) * M1.Dim(1)][0] = 1.0;
548  for (Long i = 0; i < M1.Dim(0); i++) {
549  for (Long j = 0; j < M1.Dim(1); j++) {
550  Long k = i * M1.Dim(1) + j;
551  M[k][i] = M1[i][j];
552  M[k][M1.Dim(0) + j] = -M2[i][j];
553  }
554  }
555  M = M.pinv();
556  { // Construct new permutation
557  Permutation<Long> P1_(M1.Dim(0));
558  Permutation<Long> P2_(M1.Dim(1));
559  for (Long i = 0; i < M1.Dim(0); i++) {
560  P1_.scal[i] = (M[i][M1.Dim(0) * M1.Dim(1)] > 0 ? 1 : -1);
561  }
562  for (Long i = 0; i < M1.Dim(1); i++) {
563  P2_.scal[i] = (M[M1.Dim(0) + i][M1.Dim(0) * M1.Dim(1)] > 0 ? 1 : -1);
564  }
565  P1 = P1_ * P1;
566  P2 = P2 * P2_;
567  }
568 
569  bool done = true;
570  Matrix<Long> Merr = P1 * M22 * P2 - M11;
571  for (Long i = 0; i < Merr.Dim(0) * Merr.Dim(1); i++) {
572  if (Merr[0][i]) {
573  done = false;
574  break;
575  }
576  }
577  { // Check if permutation is symmetric
578  Permutation<Long> P1_ = P1.Transpose();
579  Permutation<Long> P2_ = P2.Transpose();
580  for (Long i = 0; i < P1.Dim(); i++) {
581  if (P1_.perm[i] != P1.perm[i] || P1_.scal[i] != P1.scal[i]) {
582  done = false;
583  break;
584  }
585  }
586  for (Long i = 0; i < P2.Dim(); i++) {
587  if (P2_.perm[i] != P2.perm[i] || P2_.scal[i] != P2.scal[i]) {
588  done = false;
589  break;
590  }
591  }
592  }
593  if (done) {
594  P1_ = Permutation<ValueType>(P1.Dim());
595  P2_ = Permutation<ValueType>(P2.Dim());
596  for (Long i = 0; i < P1.Dim(); i++) {
597  P1_.perm[i] = P1.perm[i];
598  P1_.scal[i] = P1.scal[i];
599  }
600  for (Long i = 0; i < P2.Dim(); i++) {
601  P2_.perm[i] = P2.perm[i];
602  P2_.scal[i] = P2.scal[i];
603  }
604  break;
605  }
606  }
607  assert(P1_.Dim() && P2_.Dim());
608  }
609  src_perm_vec[p_type] = P1_;
610  trg_perm_vec[p_type] = P2_;
611  }
612  for (Integer i = 2; i < SymmTransCount; i++) {
613  PVFMM_ASSERT_MSG(src_perm_vec[i].Dim() && trg_perm_vec[i].Dim(), "no-symmetry for: " << ker_name);
614  }
615  }
616  if (verbose) { // Display kernel information
617  std::cout << "\n";
618  std::cout << "Kernel Name : " << ker_name << '\n';
619  std::cout << "Precision : " << (double)eps << '\n';
620  std::cout << "Scale Invariant: " << (scale_invar ? "yes" : "no") << '\n';
621  if (scale_invar && ker_dim[0] * ker_dim[1] > 0) {
622  std::cout << "Scaling Matrix :\n";
623  Matrix<ValueType> Src(ker_dim[0], 1);
624  Matrix<ValueType> Trg(1, ker_dim[1]);
625  for (Long i = 0; i < ker_dim[0]; i++) Src[i][0] = 1.0 / src_perm_vec[0].scal[i];
626  for (Long i = 0; i < ker_dim[1]; i++) Trg[0][i] = 1.0 / trg_perm_vec[0].scal[i];
 627  std::cout << Src * Trg;
628  }
629  std::cout << "\n";
630  }
631  PVFMM_ASSERT(scale_invar);
632  }
633 
634  void* ctx;
635  KerFn& ker;
636  std::string ker_name;
637  StaticArray<Integer, 2> ker_dim;
638 
639  bool scale_invar;
640  static const Integer SymmTransCount = 2 + DIM + DIM * (DIM - 1) / 2; // Scaling + Reflection + Coordinate-Swap
641  StaticArray<Permutation<ValueType>, SymmTransCount> src_perm_vec, trg_perm_vec;
642 };
643 
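// Usage sketch (illustrative; my_ker is a hypothetical function matching the
// KerFn signature above):
//
//   void my_ker(const Vector<double>& r_src, const Vector<double>& n_src,
//               const Vector<double>& v_src, const Vector<double>& r_trg,
//               Vector<double>& v_trg, const void* ctx);
//
//   KernelFunction<double, 3> ker(my_ker, 1, 1, "my-kernel", nullptr, true);
//   std::cout << ker.Name() << ": " << ker.Dim(0) << "x" << ker.Dim(1) << '\n';
//
// The constructor runs Init(), which probes the kernel numerically to detect
// scale invariance and the reflection/coordinate-swap symmetries exposed
// through SrcPerm()/TrgPerm().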
 644 class KernelFnWrapper {
 645  public:
650  template <class ValueType, class Real, class Vec, Integer DIM, Integer SRC_DIM, Integer TRG_DIM, void (*uKernel)(const Matrix<Real>&, const Matrix<Real>&, const Matrix<Real>&, const Matrix<Real>&, Matrix<Real>&)> static void kernel_wrapper(const Vector<ValueType>& r_src, const Vector<ValueType>& n_src, const Vector<ValueType>& v_src, const Vector<ValueType>& r_trg, Vector<ValueType>& v_trg) {
651  Integer VecLen = sizeof(Vec) / sizeof(Real);
652  Long src_cnt = r_src.Dim() / DIM;
653  Long trg_cnt = r_trg.Dim() / DIM;
654 
655  #define STACK_BUFF_SIZE 4096
656  StaticArray<Real, STACK_BUFF_SIZE + PVFMM_MEM_ALIGN> stack_buff;
657  Iterator<Real> buff = NULL;
658 
659  Matrix<Real> src_coord, src_norml, src_value, trg_coord, trg_value;
660  { // Rearrange data in src_coord, src_norml, src_value, trg_coord, trg_value
661  Long src_cnt_ = ((src_cnt + VecLen - 1) / VecLen) * VecLen;
662  Long trg_cnt_ = ((trg_cnt + VecLen - 1) / VecLen) * VecLen;
663 
664  Iterator<Real> buff_ptr = NULL;
665  { // Set buff_ptr
666  Long buff_size = 0;
667  if (r_src.Dim()) buff_size += src_cnt_ * DIM;
668  if (n_src.Dim()) buff_size += src_cnt_ * DIM;
669  if (v_src.Dim()) buff_size += src_cnt_ * SRC_DIM;
670  if (r_trg.Dim()) buff_size += trg_cnt_ * DIM;
671  if (v_trg.Dim()) buff_size += trg_cnt_ * TRG_DIM;
672  if (buff_size > STACK_BUFF_SIZE) { // Allocate buff
673  buff = aligned_new<Real>(buff_size);
674  buff_ptr = buff;
675  } else { // use stack_buff
 676  const uintptr_t ptr = (uintptr_t)&stack_buff[0];
677  const uintptr_t ALIGN_MINUS_ONE = PVFMM_MEM_ALIGN - 1;
678  const uintptr_t NOT_ALIGN_MINUS_ONE = ~ALIGN_MINUS_ONE;
679  const uintptr_t offset = ((ptr + ALIGN_MINUS_ONE) & NOT_ALIGN_MINUS_ONE) - ptr;
680  buff_ptr = (Iterator<Real>)((Iterator<char>)stack_buff + offset);
681  }
682  }
683  if (r_src.Dim()) { // Set src_coord
684  src_coord.ReInit(DIM, src_cnt_, buff_ptr, false);
685  buff_ptr += DIM * src_cnt_;
686  Long i = 0;
687  for (; i < src_cnt; i++) {
688  for (Long j = 0; j < DIM; j++) {
689  src_coord[j][i] = r_src[i * DIM + j];
690  }
691  }
692  for (; i < src_cnt_; i++) {
693  for (Long j = 0; j < DIM; j++) {
694  src_coord[j][i] = 0;
695  }
696  }
697  }
698  if (n_src.Dim()) { // Set src_norml
699  src_norml.ReInit(DIM, src_cnt_, buff_ptr, false);
700  buff_ptr += DIM * src_cnt_;
701  Long i = 0;
702  for (; i < src_cnt; i++) {
703  for (Long j = 0; j < DIM; j++) {
704  src_norml[j][i] = n_src[i * DIM + j];
705  }
706  }
707  for (; i < src_cnt_; i++) {
708  for (Long j = 0; j < DIM; j++) {
709  src_norml[j][i] = 0;
710  }
711  }
712  }
713  if (v_src.Dim()) { // Set src_value
714  src_value.ReInit(SRC_DIM, src_cnt_, buff_ptr, false);
715  buff_ptr += SRC_DIM * src_cnt_;
716  Long i = 0;
717  for (; i < src_cnt; i++) {
718  for (Long j = 0; j < SRC_DIM; j++) {
719  src_value[j][i] = v_src[i * SRC_DIM + j];
720  }
721  }
722  for (; i < src_cnt_; i++) {
723  for (Long j = 0; j < SRC_DIM; j++) {
724  src_value[j][i] = 0;
725  }
726  }
727  }
728  if (r_trg.Dim()) { // Set trg_coord
729  trg_coord.ReInit(DIM, trg_cnt_, buff_ptr, false);
730  buff_ptr += DIM * trg_cnt_;
731  Long i = 0;
732  for (; i < trg_cnt; i++) {
733  for (Long j = 0; j < DIM; j++) {
734  trg_coord[j][i] = r_trg[i * DIM + j];
735  }
736  }
737  for (; i < trg_cnt_; i++) {
738  for (Long j = 0; j < DIM; j++) {
739  trg_coord[j][i] = 0;
740  }
741  }
742  }
743  if (v_trg.Dim()) { // Set trg_value
744  trg_value.ReInit(TRG_DIM, trg_cnt_, buff_ptr, false);
745  buff_ptr += TRG_DIM * trg_cnt_;
746  Long i = 0;
747  for (; i < trg_cnt_; i++) {
748  for (Long j = 0; j < TRG_DIM; j++) {
749  trg_value[j][i] = 0;
750  }
751  }
752  }
753  }
754  uKernel(src_coord, src_norml, src_value, trg_coord, trg_value);
755  { // Set v_trg
756  for (Long i = 0; i < trg_cnt; i++) {
757  for (Long j = 0; j < TRG_DIM; j++) {
758  v_trg[i * TRG_DIM + j] += trg_value[j][i];
759  }
760  }
761  }
762  if (buff != NULL) { // Free memory: buff
763  aligned_delete<Real>(buff);
764  }
765  }
766 };
767 
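// Annotation: kernel_wrapper pads both point counts up to a multiple of the
// SIMD width before transposing to structure-of-arrays layout, e.g. with
// VecLen = 4 (four doubles per __m256d):
//
//   Long src_cnt  = 10;                                          // hypothetical
//   Long src_cnt_ = ((src_cnt + VecLen - 1) / VecLen) * VecLen;  // -> 12
//
// The padded tail is zero-filled so the vectorized micro-kernel processes
// whole SIMD lanes without a scalar remainder loop; only the first trg_cnt
// targets are copied back into v_trg.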
768 template <class ValueType> struct Laplace3D {
769  public:
770  static const Integer DIM = 3;
771 
772  inline static const KernelFunction<ValueType, DIM>& single_layer() {
773  static KernelFunction<ValueType, DIM> ker(sl_poten<2>, 1, 1, "laplace-sl");
774  return ker;
775  }
776 
777  inline static const KernelFunction<ValueType, DIM>& double_layer() {
778  static KernelFunction<ValueType, DIM> ker(dl_poten, 1, 1, "laplace-dl");
779  return ker;
780  }
781 
782  inline static const KernelFunction<ValueType, DIM>& single_layer_gradient() {
783  static KernelFunction<ValueType, DIM> ker(sl_grad, 1, DIM, "laplace-sl-grad");
784  return ker;
785  }
786 
787  inline static const KernelFunction<ValueType, DIM>& double_layer_gradient() {
788  static KernelFunction<ValueType, DIM> ker(dl_grad, 1, DIM, "laplace-dl-grad");
789  return ker;
790  }
791 
792  protected:
793  template <class Vec = ValueType, Vec (*RSQRT_INTRIN)(Vec) = rsqrt_intrin0<Vec>> static void sl_poten_uKernel(const Matrix<ValueType>& src_coord, const Matrix<ValueType>& src_norml, const Matrix<ValueType>& src_value, const Matrix<ValueType>& trg_coord, Matrix<ValueType>& trg_value) {
794  #define SRC_BLK 1000
795  Integer VecLen = sizeof(Vec) / sizeof(ValueType);
796 
798  Integer NWTN_ITER = 0;
799  if (RSQRT_INTRIN == (Vec (*)(Vec))rsqrt_intrin0<Vec, ValueType>) NWTN_ITER = 0;
800  if (RSQRT_INTRIN == (Vec (*)(Vec))rsqrt_intrin1<Vec, ValueType>) NWTN_ITER = 1;
801  if (RSQRT_INTRIN == (Vec (*)(Vec))rsqrt_intrin2<Vec, ValueType>) NWTN_ITER = 2;
802  if (RSQRT_INTRIN == (Vec (*)(Vec))rsqrt_intrin3<Vec, ValueType>) NWTN_ITER = 3;
803 
 804  ValueType nwtn_scal = 1; // Scaling factor for Newton iterations
805  for (Integer i = 0; i < NWTN_ITER; i++) {
806  nwtn_scal = 2 * nwtn_scal * nwtn_scal * nwtn_scal;
807  }
808  const ValueType OOFP = 1.0 / (4 * nwtn_scal * const_pi<ValueType>());
809 
810  Long src_cnt_ = src_coord.Dim(1);
811  Long trg_cnt_ = trg_coord.Dim(1);
812  for (Long sblk = 0; sblk < src_cnt_; sblk += SRC_BLK) {
813  Long src_cnt = src_cnt_ - sblk;
814  if (src_cnt > SRC_BLK) src_cnt = SRC_BLK;
815  for (Long t = 0; t < trg_cnt_; t += VecLen) {
816  Vec tx = load_intrin<Vec>(&trg_coord[0][t]);
817  Vec ty = load_intrin<Vec>(&trg_coord[1][t]);
818  Vec tz = load_intrin<Vec>(&trg_coord[2][t]);
819  Vec tv = zero_intrin<Vec>();
820  for (Long s = sblk; s < sblk + src_cnt; s++) {
821  Vec dx = sub_intrin(tx, bcast_intrin<Vec>(&src_coord[0][s]));
822  Vec dy = sub_intrin(ty, bcast_intrin<Vec>(&src_coord[1][s]));
823  Vec dz = sub_intrin(tz, bcast_intrin<Vec>(&src_coord[2][s]));
824  Vec sv = bcast_intrin<Vec>(&src_value[0][s]);
825 
826  Vec r2 = mul_intrin(dx, dx);
827  r2 = add_intrin(r2, mul_intrin(dy, dy));
828  r2 = add_intrin(r2, mul_intrin(dz, dz));
829 
830  Vec rinv = RSQRT_INTRIN(r2);
831  tv = add_intrin(tv, mul_intrin(rinv, sv));
832  }
833  Vec oofp = set_intrin<Vec, ValueType>(OOFP);
834  tv = add_intrin(mul_intrin(tv, oofp), load_intrin<Vec>(&trg_value[0][t]));
835  store_intrin(&trg_value[0][t], tv);
836  }
837  }
838 
839  { // Add FLOPS
840  #ifndef __MIC__
841  //Profile::Add_FLOP((Long)trg_cnt_ * (Long)src_cnt_ * (12 + 4 * (NWTN_ITER)));
842  #endif
843  }
844  #undef SRC_BLK
845  }
846 
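 // Annotation: rsqrt_intrin<k> applies k Newton refinements of an approximate
 // reciprocal square root with the halving step of Newton's method omitted
 // (saving a multiply), so the result carries a known constant scale that
 // compounds as nwtn_scal <- 2*nwtn_scal^3 per iteration (the loop above):
 //
 //   NWTN_ITER = 0:  nwtn_scal = 1
 //   NWTN_ITER = 1:  nwtn_scal = 2
 //   NWTN_ITER = 2:  nwtn_scal = 2*2^3  = 16
 //   NWTN_ITER = 3:  nwtn_scal = 2*16^3 = 8192
 //
 // Dividing it into OOFP = 1/(4*pi*nwtn_scal) removes the factor with one
 // multiply per target instead of rescaling every rinv.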
847  private:
848  template <Integer newton_iter = 0> static void sl_poten(const Vector<ValueType>& r_src, const Vector<ValueType>& n_src, const Vector<ValueType>& v_src, const Vector<ValueType>& r_trg, Vector<ValueType>& v_trg, const void* ctx) {
849  #define PVFMM_KER_NWTN(nwtn) \
850  if (newton_iter == nwtn) KernelFnWrapper::kernel_wrapper<ValueType, Real, Vec, DIM, 1, 1, Laplace3D<Real>::template sl_poten_uKernel<Vec, rsqrt_intrin##nwtn<Vec, Real>>>(r_src, n_src, v_src, r_trg, v_trg)
851  #define PVFMM_KERNEL_MACRO \
852  PVFMM_KER_NWTN(0); \
853  PVFMM_KER_NWTN(1); \
854  PVFMM_KER_NWTN(2); \
855  PVFMM_KER_NWTN(3);
 856  if (sizeof(ValueType) == sizeof(float)) { // dispatch on precision (assumed form)
 857  typedef float Real;
858  #if defined __MIC__
859  #define Vec Real
860  #elif defined __AVX__
861  #define Vec __m256
862  #elif defined __SSE3__
863  #define Vec __m128
864  #else
865  #define Vec Real
866  #endif
867  PVFMM_KERNEL_MACRO;
868  #undef Vec
 869  } else if (sizeof(ValueType) == sizeof(double)) {
 870  typedef double Real;
871  #if defined __MIC__
872  #define Vec Real
873  #elif defined __AVX__
874  #define Vec __m256d
875  #elif defined __SSE3__
876  #define Vec __m128d
877  #else
878  #define Vec Real
879  #endif
880  PVFMM_KERNEL_MACRO;
881  #undef Vec
882  } else {
883  typedef ValueType Real;
884  #define Vec Real
885  PVFMM_KERNEL_MACRO;
886  #undef Vec
887  }
888  #undef PVFMM_KER_NWTN
889  }
890 
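 // Annotation: with newton_iter == 2, double precision, and AVX the macro
 // block above resolves at compile time to a single call of the form
 //
 //   KernelFnWrapper::kernel_wrapper<ValueType, double, __m256d, DIM, 1, 1,
 //       Laplace3D<double>::sl_poten_uKernel<__m256d, rsqrt_intrin2<__m256d, double>>>(
 //           r_src, n_src, v_src, r_trg, v_trg);
 //
 // The if (newton_iter == nwtn) guards compare template constants, so the
 // three dead branches are eliminated by the compiler.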
891  static void dl_poten(const Vector<ValueType>& r_src, const Vector<ValueType>& n_src, const Vector<ValueType>& v_src, const Vector<ValueType>& r_trg, Vector<ValueType>& v_trg, const void* ctx) {
892  const ValueType OOFP = 1.0 / (4 * const_pi<ValueType>());
893  Long Nsrc = r_src.Dim() / DIM;
894  Long Ntrg = r_trg.Dim() / DIM;
895  PVFMM_ASSERT(r_src.Dim() == Nsrc * DIM);
896  PVFMM_ASSERT(n_src.Dim() == Nsrc * DIM);
897  PVFMM_ASSERT(v_src.Dim() == Nsrc * 1);
898  PVFMM_ASSERT(r_trg.Dim() == Ntrg * DIM);
899  PVFMM_ASSERT(v_trg.Dim() == Ntrg * 1);
900  for (Long i = 0; i < Ntrg; i++) {
901  ValueType trg_val = 0.0;
902  for (Long j = 0; j < Nsrc; j++) {
903  ValueType dx[DIM];
904  for (Integer k = 0; k < DIM; k++) {
905  dx[k] = r_trg[i * DIM + k] - r_src[j * DIM + k];
906  }
907 
908  ValueType n_dot_r = 0;
909  for (Integer k = 0; k < DIM; k++) {
910  n_dot_r += n_src[j * DIM + k] * dx[k];
911  }
912 
913  ValueType r2 = 0;
914  for (Integer k = 0; k < DIM; k++) {
915  r2 += dx[k] * dx[k];
916  }
917  ValueType r2inv = (r2 ? 1.0 / r2 : 0.0);
918  ValueType rinv = sqrt(r2inv);
919 
920  trg_val += v_src[j] * n_dot_r * r2inv * rinv;
921  }
922  v_trg[i] += trg_val * OOFP;
923  }
924  //Profile::Add_FLOP((Long)Ntrg * (Long)Nsrc * 19);
925  }
926 
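 // Annotation: the loop above evaluates the Laplace double-layer potential
 //
 //   u(x_i) = 1/(4*pi) * sum_j v_j * (n_j . (x_i - y_j)) / |x_i - y_j|^3,
 //
 // with the (r2 ? 1.0/r2 : 0.0) guard zeroing the self-term when a target
 // coincides with a source.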
927  static void sl_grad(const Vector<ValueType>& r_src, const Vector<ValueType>& n_src, const Vector<ValueType>& v_src, const Vector<ValueType>& r_trg, Vector<ValueType>& v_trg, const void* ctx) {
928  Long Nsrc = r_src.Dim() / DIM;
929  Long Ntrg = r_trg.Dim() / DIM;
930  PVFMM_ASSERT(r_src.Dim() == Nsrc * DIM);
931  PVFMM_ASSERT(v_src.Dim() == Nsrc * 1);
932  PVFMM_ASSERT(r_trg.Dim() == Ntrg * DIM);
933  PVFMM_ASSERT(v_trg.Dim() == Ntrg * DIM);
934  for (Long i = 0; i < Ntrg; i++) {
935  ValueType trg_val[DIM];
936  for (Integer k = 0; k < DIM; k++) {
937  trg_val[k] = 0;
938  }
939  for (Long j = 0; j < Nsrc; j++) {
940  ValueType dx[DIM];
941  for (Integer k = 0; k < DIM; k++) {
942  dx[k] = r_trg[i * DIM + k] - r_src[j * DIM + k];
943  }
944 
945  ValueType r2 = 0;
946  for (Integer k = 0; k < DIM; k++) {
947  r2 += dx[k] * dx[k];
948  }
949  ValueType r2inv = (r2 ? 1.0 / r2 : 0.0);
950  ValueType rinv = sqrt(r2inv);
951 
952  ValueType v_r3inv = v_src[j] * r2inv * rinv;
953  for (Integer k = 0; k < DIM; k++) {
954  trg_val[k] += dx[k] * v_r3inv;
955  }
956  }
957  for (Integer k = 0; k < DIM; k++) {
958  v_trg[i * DIM + k] += trg_val[k];
959  }
960  }
961  //Profile::Add_FLOP((Long)Ntrg * (Long)Nsrc * 18);
962  }
963 
964  static void dl_grad(const Vector<ValueType>& r_src, const Vector<ValueType>& n_src, const Vector<ValueType>& v_src, const Vector<ValueType>& r_trg, Vector<ValueType>& v_trg, const void* ctx) {
965  Long Nsrc = r_src.Dim() / DIM;
966  Long Ntrg = r_trg.Dim() / DIM;
967  PVFMM_ASSERT(r_src.Dim() == Nsrc * DIM);
968  PVFMM_ASSERT(n_src.Dim() == Nsrc * DIM);
969  PVFMM_ASSERT(v_src.Dim() == Nsrc * 1);
970  PVFMM_ASSERT(r_trg.Dim() == Ntrg * DIM);
971  PVFMM_ASSERT(v_trg.Dim() == Ntrg * DIM);
972  for (Long i = 0; i < Ntrg; i++) {
973  ValueType trg_val[DIM];
974  for (Integer k = 0; k < DIM; k++) {
975  trg_val[k] = 0;
976  }
977  for (Long j = 0; j < Nsrc; j++) {
978  ValueType dx[DIM];
979  for (Integer k = 0; k < DIM; k++) {
980  dx[k] = r_trg[i * DIM + k] - r_src[j * DIM + k];
981  }
982 
983  ValueType n_dot_r = 0;
984  for (Integer k = 0; k < DIM; k++) {
985  n_dot_r += n_src[j * DIM + k] * dx[k];
986  }
987 
988  ValueType r2 = 0;
989  for (Integer k = 0; k < DIM; k++) {
990  r2 += dx[k] * dx[k];
991  }
992  ValueType r2inv = (r2 ? 1.0 / r2 : 0.0);
993  ValueType rinv = sqrt(r2inv);
994 
995  ValueType v_nr_r5inv = v_src[j] * r2inv * r2inv * rinv * n_dot_r;
996  for (Integer k = 0; k < DIM; k++) {
997  trg_val[k] += dx[k] * v_nr_r5inv;
998  }
999  }
1000  for (Integer k = 0; k < DIM; k++) {
1001  v_trg[i * DIM + k] += trg_val[k];
1002  }
1003  }
1004  //Profile::Add_FLOP((Long)Ntrg * (Long)Nsrc * 25);
1005  }
1006 
1007  friend class KernelFnWrapper;
1008 };
1009 
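// Usage sketch (illustrative, hypothetical values): potential of one unit
// charge at the origin, evaluated at a target 2 units away.
//
//   const auto& ker = Laplace3D<double>::single_layer();
//   Vector<double> r_src(3), n_src, v_src(1), r_trg(3), v_trg(1);
//   r_src.SetZero(); v_src[0] = 1;
//   r_trg.SetZero(); r_trg[0] = 2; v_trg.SetZero();
//   ker(r_src, n_src, v_src, r_trg, v_trg);
//   // v_trg[0] ~= 1/(4*pi*2), since u(x) = q/(4*pi*|x-y|)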
1010 template <class ValueType> struct Stokes3D {
1011  public:
1012  static const Integer DIM = 3;
1013 
1014  inline static const KernelFunction<ValueType, DIM>& FxP() {
1015  constexpr Integer k_dim0 = DIM;
1016  constexpr Integer k_dim1 = 1;
1017  static const ValueType scal = 1.0/(8.0*const_pi<ValueType>());
1018  static auto micro_ker = [&](ValueType* v, const ValueType* x, ValueType invr, const ValueType* n, const ValueType* f) {
1019  ValueType fdotr = 0;
1020  ValueType invr2 = invr*invr;
1021  ValueType invr3 = invr2*invr;
1022  ValueType invr5 = invr2*invr3;
1023  ValueType invr7 = invr2*invr5;
1024  for(Integer k=0;k<DIM;k++) fdotr += x[k] * f[k];
1025 
1026  v[0] += (2*invr3*fdotr) * scal;
1027  };
1028  return KernelFromLambda<k_dim0, k_dim1>(__FUNCTION__, micro_ker);
1029  }
1030 
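 // Annotation: FxP is the Stokeslet pressure. With scal = 1/(8*pi) the update
 // v[0] += 2*invr3*fdotr*scal is
 //
 //   p(x) = (f . r) / (4*pi*|r|^3),   r = x - y.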
1031  inline static const KernelFunction<ValueType, DIM>& FxdP() {
1032  constexpr Integer k_dim0 = DIM;
1033  constexpr Integer k_dim1 = DIM;
1034  static const ValueType scal = 1.0/(8.0*const_pi<ValueType>());
1035  static auto micro_ker = [&](ValueType* v, const ValueType* x, ValueType invr, const ValueType* n, const ValueType* f) {
1036  ValueType fdotr = 0;
1037  ValueType invr2 = invr*invr;
1038  ValueType invr3 = invr2*invr;
1039  ValueType invr5 = invr2*invr3;
1040  ValueType invr7 = invr2*invr5;
1041  for(Integer k=0;k<DIM;k++) fdotr += x[k] * f[k];
1042 
1043  v[0] += (2*invr3*f[0]-6*x[0]*invr5*fdotr) * scal;
1044  v[1] += (2*invr3*f[1]-6*x[1]*invr5*fdotr) * scal;
1045  v[2] += (2*invr3*f[2]-6*x[2]*invr5*fdotr) * scal;
1046  };
1047  return KernelFromLambda<k_dim0, k_dim1>(__FUNCTION__, micro_ker);
1048  }
1049 
1050  inline static const KernelFunction<ValueType, DIM>& FxU() {
1051  constexpr Integer k_dim0 = DIM;
1052  constexpr Integer k_dim1 = DIM;
1053  static const ValueType scal = 1.0/(8.0*const_pi<ValueType>());
1054  static auto micro_ker = [&](ValueType* v, const ValueType* x, ValueType invr, const ValueType* n, const ValueType* f) {
1055  ValueType fdotr = 0;
1056  ValueType invr2 = invr*invr;
1057  ValueType invr3 = invr2*invr;
1058  ValueType invr5 = invr2*invr3;
1059  ValueType invr7 = invr2*invr5;
1060  for(Integer k=0;k<DIM;k++) fdotr += x[k] * f[k];
1061 
1062  v[0] += (x[0]*invr3*fdotr+f[0]*invr) * scal;
1063  v[1] += (x[1]*invr3*fdotr+f[1]*invr) * scal;
1064  v[2] += (x[2]*invr3*fdotr+f[2]*invr) * scal;
1065  };
1066  return KernelFromLambda<k_dim0, k_dim1>(__FUNCTION__, micro_ker);
1067  }
1068 
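 // Annotation: FxU is the free-space Stokeslet (Oseen tensor) with unit
 // viscosity; v[i] += (x[i]*invr3*fdotr + f[i]*invr) * scal is
 //
 //   u_i(x) = 1/(8*pi) * ( delta_ij / |r| + r_i r_j / |r|^3 ) f_j.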
1069  inline static const KernelFunction<ValueType, DIM>& FxdU() {
1070  constexpr Integer k_dim0 = DIM;
1071  constexpr Integer k_dim1 = DIM * DIM;
1072  static const ValueType scal = 1.0/(8.0*const_pi<ValueType>());
1073  static auto micro_ker = [&](ValueType* v, const ValueType* x, ValueType invr, const ValueType* n, const ValueType* f) {
1074  ValueType fdotr = 0;
1075  ValueType invr2 = invr*invr;
1076  ValueType invr3 = invr2*invr;
1077  ValueType invr5 = invr2*invr3;
1078  ValueType invr7 = invr2*invr5;
1079  for(Integer k=0;k<DIM;k++) fdotr += x[k] * f[k];
1080 
1081  v[0] += (invr3*fdotr-3*x[0]*x[0]*invr5*fdotr ) * scal;
1082  v[1] += ((-3*x[0]*x[1]*invr5*fdotr)+x[0]*invr3*f[1]-x[1]*invr3*f[0]) * scal;
1083  v[2] += ((-3*x[0]*x[2]*invr5*fdotr)+x[0]*invr3*f[2]-x[2]*invr3*f[0]) * scal;
1084  v[3] += ((-3*x[0]*x[1]*invr5*fdotr)-x[0]*invr3*f[1]+x[1]*invr3*f[0]) * scal;
1085  v[4] += (invr3*fdotr-3*x[1]*x[1]*invr5*fdotr ) * scal;
1086  v[5] += ((-3*x[1]*x[2]*invr5*fdotr)+x[1]*invr3*f[2]-x[2]*invr3*f[1]) * scal;
1087  v[6] += ((-3*x[0]*x[2]*invr5*fdotr)-x[0]*invr3*f[2]+x[2]*invr3*f[0]) * scal;
1088  v[7] += ((-3*x[1]*x[2]*invr5*fdotr)-x[1]*invr3*f[2]+x[2]*invr3*f[1]) * scal;
1089  v[8] += (invr3*fdotr-3*x[2]*x[2]*invr5*fdotr ) * scal;
1090  };
1091  return KernelFromLambda<k_dim0, k_dim1>(__FUNCTION__, micro_ker);
1092  }
1093 
1094  inline static const KernelFunction<ValueType, DIM>& Fxd2U() {
1095  constexpr Integer k_dim0 = DIM;
1096  constexpr Integer k_dim1 = DIM * DIM * DIM;
1097  static const ValueType scal = 1.0/(8.0*const_pi<ValueType>());
1098  static auto micro_ker = [&](ValueType* v, const ValueType* x, ValueType invr, const ValueType* n, const ValueType* f) {
1099  ValueType fdotr = 0;
1100  ValueType invr2 = invr*invr;
1101  ValueType invr3 = invr2*invr;
1102  ValueType invr5 = invr2*invr3;
1103  ValueType invr7 = invr2*invr5;
1104  for(Integer k=0;k<DIM;k++) fdotr += x[k] * f[k];
1105 
1106  v[ 0] += ((-9*x[0]*invr5*fdotr)+15*x[0]*x[0]*x[0]*invr7*fdotr+invr3*f[0]-3*x[0]*x[0]*invr5*f[0] ) * scal;
1107  v[ 1] += ((-3*x[1]*invr5*fdotr)+15*x[0]*x[0]*x[1]*invr7*fdotr+invr3*f[1]-3*x[0]*x[0]*invr5*f[1] ) * scal;
1108  v[ 2] += ((-3*x[2]*invr5*fdotr)+15*x[0]*x[0]*x[2]*invr7*fdotr+invr3*f[2]-3*x[0]*x[0]*invr5*f[2] ) * scal;
1109  v[ 3] += ((-3*x[1]*invr5*fdotr)+15*x[0]*x[0]*x[1]*invr7*fdotr+invr3*f[1]-3*x[0]*x[0]*invr5*f[1] ) * scal;
1110  v[ 4] += ((-3*x[0]*invr5*fdotr)+15*x[0]*x[1]*x[1]*invr7*fdotr-6*x[0]*x[1]*invr5*f[1]-invr3*f[0]+3*x[1]*x[1]*invr5*f[0]) * scal;
1111  v[ 5] += (15*x[0]*x[1]*x[2]*invr7*fdotr-3*x[0]*x[1]*invr5*f[2]-3*x[0]*x[2]*invr5*f[1]+3*x[1]*x[2]*invr5*f[0] ) * scal;
1112  v[ 6] += ((-3*x[2]*invr5*fdotr)+15*x[0]*x[0]*x[2]*invr7*fdotr+invr3*f[2]-3*x[0]*x[0]*invr5*f[2] ) * scal;
1113  v[ 7] += (15*x[0]*x[1]*x[2]*invr7*fdotr-3*x[0]*x[1]*invr5*f[2]-3*x[0]*x[2]*invr5*f[1]+3*x[1]*x[2]*invr5*f[0] ) * scal;
1114  v[ 8] += ((-3*x[0]*invr5*fdotr)+15*x[0]*x[2]*x[2]*invr7*fdotr-6*x[0]*x[2]*invr5*f[2]-invr3*f[0]+3*x[2]*x[2]*invr5*f[0]) * scal;
1115  v[ 9] += ((-3*x[1]*invr5*fdotr)+15*x[0]*x[0]*x[1]*invr7*fdotr-invr3*f[1]+3*x[0]*x[0]*invr5*f[1]-6*x[0]*x[1]*invr5*f[0]) * scal;
1116  v[10] += ((-3*x[0]*invr5*fdotr)+15*x[0]*x[1]*x[1]*invr7*fdotr+invr3*f[0]-3*x[1]*x[1]*invr5*f[0] ) * scal;
1117  v[11] += (15*x[0]*x[1]*x[2]*invr7*fdotr-3*x[0]*x[1]*invr5*f[2]+3*x[0]*x[2]*invr5*f[1]-3*x[1]*x[2]*invr5*f[0] ) * scal;
1118  v[12] += ((-3*x[0]*invr5*fdotr)+15*x[0]*x[1]*x[1]*invr7*fdotr+invr3*f[0]-3*x[1]*x[1]*invr5*f[0] ) * scal;
1119  v[13] += ((-9*x[1]*invr5*fdotr)+15*x[1]*x[1]*x[1]*invr7*fdotr+invr3*f[1]-3*x[1]*x[1]*invr5*f[1] ) * scal;
1120  v[14] += ((-3*x[2]*invr5*fdotr)+15*x[1]*x[1]*x[2]*invr7*fdotr+invr3*f[2]-3*x[1]*x[1]*invr5*f[2] ) * scal;
1121  v[15] += (15*x[0]*x[1]*x[2]*invr7*fdotr-3*x[0]*x[1]*invr5*f[2]+3*x[0]*x[2]*invr5*f[1]-3*x[1]*x[2]*invr5*f[0] ) * scal;
1122  v[16] += ((-3*x[2]*invr5*fdotr)+15*x[1]*x[1]*x[2]*invr7*fdotr+invr3*f[2]-3*x[1]*x[1]*invr5*f[2] ) * scal;
1123  v[17] += ((-3*x[1]*invr5*fdotr)+15*x[1]*x[2]*x[2]*invr7*fdotr-6*x[1]*x[2]*invr5*f[2]-invr3*f[1]+3*x[2]*x[2]*invr5*f[1]) * scal;
1124  v[18] += ((-3*x[2]*invr5*fdotr)+15*x[0]*x[0]*x[2]*invr7*fdotr-invr3*f[2]+3*x[0]*x[0]*invr5*f[2]-6*x[0]*x[2]*invr5*f[0]) * scal;
1125  v[19] += (15*x[0]*x[1]*x[2]*invr7*fdotr+3*x[0]*x[1]*invr5*f[2]-3*x[0]*x[2]*invr5*f[1]-3*x[1]*x[2]*invr5*f[0] ) * scal;
1126  v[20] += ((-3*x[0]*invr5*fdotr)+15*x[0]*x[2]*x[2]*invr7*fdotr+invr3*f[0]-3*x[2]*x[2]*invr5*f[0] ) * scal;
1127  v[21] += (15*x[0]*x[1]*x[2]*invr7*fdotr+3*x[0]*x[1]*invr5*f[2]-3*x[0]*x[2]*invr5*f[1]-3*x[1]*x[2]*invr5*f[0] ) * scal;
1128  v[22] += ((-3*x[2]*invr5*fdotr)+15*x[1]*x[1]*x[2]*invr7*fdotr-invr3*f[2]+3*x[1]*x[1]*invr5*f[2]-6*x[1]*x[2]*invr5*f[1]) * scal;
1129  v[23] += ((-3*x[1]*invr5*fdotr)+15*x[1]*x[2]*x[2]*invr7*fdotr+invr3*f[1]-3*x[2]*x[2]*invr5*f[1] ) * scal;
1130  v[24] += ((-3*x[0]*invr5*fdotr)+15*x[0]*x[2]*x[2]*invr7*fdotr+invr3*f[0]-3*x[2]*x[2]*invr5*f[0] ) * scal;
1131  v[25] += ((-3*x[1]*invr5*fdotr)+15*x[1]*x[2]*x[2]*invr7*fdotr+invr3*f[1]-3*x[2]*x[2]*invr5*f[1] ) * scal;
1132  v[26] += ((-9*x[2]*invr5*fdotr)+15*x[2]*x[2]*x[2]*invr7*fdotr+invr3*f[2]-3*x[2]*x[2]*invr5*f[2] ) * scal;
1133  };
1134  return KernelFromLambda<k_dim0, k_dim1>(__FUNCTION__, micro_ker);
1135  }
1136 
1137  inline static const KernelFunction<ValueType, DIM>& FxPU() {
1138  constexpr Integer k_dim0 = DIM;
1139  constexpr Integer k_dim1 = DIM+1;
1140  static const ValueType scal = 1.0/(8.0*const_pi<ValueType>());
1141  static auto micro_ker = [&](ValueType* v, const ValueType* x, ValueType invr, const ValueType* n, const ValueType* f) {
1142  ValueType fdotr = 0;
1143  ValueType invr2 = invr*invr;
1144  ValueType invr3 = invr2*invr;
1145  ValueType invr5 = invr2*invr3;
1146  ValueType invr7 = invr2*invr5;
1147  for(Integer k=0;k<DIM;k++) fdotr += x[k] * f[k];
1148 
1149  v[0] += (2*invr3*fdotr ) * scal;
1150  v[1] += (x[0]*invr3*fdotr+f[0]*invr) * scal;
1151  v[2] += (x[1]*invr3*fdotr+f[1]*invr) * scal;
1152  v[3] += (x[2]*invr3*fdotr+f[2]*invr) * scal;
1153  };
1154  return KernelFromLambda<k_dim0, k_dim1>(__FUNCTION__, micro_ker);
1155  }
1156 
1158 
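 // Annotation: the FS* kernels below take a (DIM+1)-component density, where
 // f[0..2] is the Stokeslet force and f[3] appears to act as an additional
 // point-source (potential-flow) strength: its velocity contribution is
 // f[3] * r / (8*pi*|r|^3), i.e. the Fx* expressions above with fdotr
 // replaced by f[3]. FSxP ignores f[3], consistent with a potential source
 // carrying no pressure perturbation.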
1159  inline static const KernelFunction<ValueType, DIM>& FSxP() {
1160  constexpr Integer k_dim0 = DIM+1;
1161  constexpr Integer k_dim1 = 1;
1162  static const ValueType scal = 1.0/(8.0*const_pi<ValueType>());
1163  static auto micro_ker = [&](ValueType* v, const ValueType* x, ValueType invr, const ValueType* n, const ValueType* f) {
1164  ValueType fdotr = 0;
1165  ValueType invr2 = invr*invr;
1166  ValueType invr3 = invr2*invr;
1167  ValueType invr5 = invr2*invr3;
1168  ValueType invr7 = invr2*invr5;
1169  for(Integer k=0;k<DIM;k++) fdotr += x[k] * f[k];
1170 
1171  v[0] += (2*invr3*fdotr) * scal;
1172  };
1173  return KernelFromLambda<k_dim0, k_dim1>(__FUNCTION__, micro_ker);
1174  }
1175 
1176  inline static const KernelFunction<ValueType, DIM>& FSxdP() {
1177  constexpr Integer k_dim0 = DIM+1;
1178  constexpr Integer k_dim1 = DIM;
1179  static const ValueType scal = 1.0/(8.0*const_pi<ValueType>());
1180  static auto micro_ker = [&](ValueType* v, const ValueType* x, ValueType invr, const ValueType* n, const ValueType* f) {
1181  ValueType fdotr = 0;
1182  ValueType invr2 = invr*invr;
1183  ValueType invr3 = invr2*invr;
1184  ValueType invr5 = invr2*invr3;
1185  ValueType invr7 = invr2*invr5;
1186  for(Integer k=0;k<DIM;k++) fdotr += x[k] * f[k];
1187 
1188  v[0] += (2*invr3*f[0]-6*x[0]*invr5*fdotr) * scal;
1189  v[1] += (2*invr3*f[1]-6*x[1]*invr5*fdotr) * scal;
1190  v[2] += (2*invr3*f[2]-6*x[2]*invr5*fdotr) * scal;
1191  };
1192  return KernelFromLambda<k_dim0, k_dim1>(__FUNCTION__, micro_ker);
1193  }
1194 
1195  inline static const KernelFunction<ValueType, DIM>& FSxU() {
1196  constexpr Integer k_dim0 = DIM+1;
1197  constexpr Integer k_dim1 = DIM;
1198  static const ValueType scal = 1.0/(8.0*const_pi<ValueType>());
1199  static auto micro_ker = [&](ValueType* v, const ValueType* x, ValueType invr, const ValueType* n, const ValueType* f) {
1200  ValueType fdotr = 0;
1201  ValueType invr2 = invr*invr;
1202  ValueType invr3 = invr2*invr;
1203  ValueType invr5 = invr2*invr3;
1204  ValueType invr7 = invr2*invr5;
1205  for(Integer k=0;k<DIM;k++) fdotr += x[k] * f[k];
1206 
1207  v[0] += (x[0]*invr3*f[3]+x[0]*invr3*fdotr+f[0]*invr) * scal;
1208  v[1] += (x[1]*invr3*f[3]+x[1]*invr3*fdotr+f[1]*invr) * scal;
1209  v[2] += (x[2]*invr3*f[3]+x[2]*invr3*fdotr+f[2]*invr) * scal;
1210  };
1211  return KernelFromLambda<k_dim0, k_dim1>(__FUNCTION__, micro_ker);
1212  }
1213 
1214  inline static const KernelFunction<ValueType, DIM>& FSxdU() {
1215  constexpr Integer k_dim0 = DIM+1;
1216  constexpr Integer k_dim1 = DIM * DIM;
1217  static const ValueType scal = 1.0/(8.0*const_pi<ValueType>());
1218  static auto micro_ker = [&](ValueType* v, const ValueType* x, ValueType invr, const ValueType* n, const ValueType* f) {
1219  ValueType fdotr = 0;
1220  ValueType invr2 = invr*invr;
1221  ValueType invr3 = invr2*invr;
1222  ValueType invr5 = invr2*invr3;
1223  ValueType invr7 = invr2*invr5;
1224  for(Integer k=0;k<DIM;k++) fdotr += x[k] * f[k];
1225 
1226  v[0] += (invr3*f[3]-3*x[0]*x[0]*invr5*f[3]+invr3*fdotr-3*x[0]*x[0]*invr5*fdotr ) * scal;
1227  v[1] += ((-3*x[0]*x[1]*invr5*f[3])-3*x[0]*x[1]*invr5*fdotr+x[0]*invr3*f[1]-x[1]*invr3*f[0]) * scal;
1228  v[2] += ((-3*x[0]*x[2]*invr5*f[3])-3*x[0]*x[2]*invr5*fdotr+x[0]*invr3*f[2]-x[2]*invr3*f[0]) * scal;
1229  v[3] += ((-3*x[0]*x[1]*invr5*f[3])-3*x[0]*x[1]*invr5*fdotr-x[0]*invr3*f[1]+x[1]*invr3*f[0]) * scal;
1230  v[4] += (invr3*f[3]-3*x[1]*x[1]*invr5*f[3]+invr3*fdotr-3*x[1]*x[1]*invr5*fdotr ) * scal;
1231  v[5] += ((-3*x[1]*x[2]*invr5*f[3])-3*x[1]*x[2]*invr5*fdotr+x[1]*invr3*f[2]-x[2]*invr3*f[1]) * scal;
1232  v[6] += ((-3*x[0]*x[2]*invr5*f[3])-3*x[0]*x[2]*invr5*fdotr-x[0]*invr3*f[2]+x[2]*invr3*f[0]) * scal;
1233  v[7] += ((-3*x[1]*x[2]*invr5*f[3])-3*x[1]*x[2]*invr5*fdotr-x[1]*invr3*f[2]+x[2]*invr3*f[1]) * scal;
1234  v[8] += (invr3*f[3]-3*x[2]*x[2]*invr5*f[3]+invr3*fdotr-3*x[2]*x[2]*invr5*fdotr ) * scal;
1235  };
1236  return KernelFromLambda<k_dim0, k_dim1>(__FUNCTION__, micro_ker);
1237  }
1238 
1239  inline static const KernelFunction<ValueType, DIM>& FSxd2U() {
1240  constexpr Integer k_dim0 = DIM+1;
1241  constexpr Integer k_dim1 = DIM * DIM * DIM;
1242  static const ValueType scal = 1.0/(8.0*const_pi<ValueType>());
1243  static auto micro_ker = [&](ValueType* v, const ValueType* x, ValueType invr, const ValueType* n, const ValueType* f) {
1244  ValueType fdotr = 0;
1245  ValueType invr2 = invr*invr;
1246  ValueType invr3 = invr2*invr;
1247  ValueType invr5 = invr2*invr3;
1248  ValueType invr7 = invr2*invr5;
1249  for(Integer k=0;k<DIM;k++) fdotr += x[k] * f[k];
1250 
1251  v[ 0] += ((-9*x[0]*invr5*f[3])+15*x[0]*x[0]*x[0]*invr7*f[3]-9*x[0]*invr5*fdotr+15*x[0]*x[0]*x[0]*invr7*fdotr+invr3*f[0]-3*x[0]*x[0]*invr5*f[0] ) * scal;
1252  v[ 1] += ((-3*x[1]*invr5*f[3])+15*x[0]*x[0]*x[1]*invr7*f[3]-3*x[1]*invr5*fdotr+15*x[0]*x[0]*x[1]*invr7*fdotr+invr3*f[1]-3*x[0]*x[0]*invr5*f[1] ) * scal;
1253  v[ 2] += ((-3*x[2]*invr5*f[3])+15*x[0]*x[0]*x[2]*invr7*f[3]-3*x[2]*invr5*fdotr+15*x[0]*x[0]*x[2]*invr7*fdotr+invr3*f[2]-3*x[0]*x[0]*invr5*f[2] ) * scal;
1254  v[ 3] += ((-3*x[1]*invr5*f[3])+15*x[0]*x[0]*x[1]*invr7*f[3]-3*x[1]*invr5*fdotr+15*x[0]*x[0]*x[1]*invr7*fdotr+invr3*f[1]-3*x[0]*x[0]*invr5*f[1] ) * scal;
1255  v[ 4] += ((-3*x[0]*invr5*f[3])+15*x[0]*x[1]*x[1]*invr7*f[3]-3*x[0]*invr5*fdotr+15*x[0]*x[1]*x[1]*invr7*fdotr-6*x[0]*x[1]*invr5*f[1]-invr3*f[0]+3*x[1]*x[1]*invr5*f[0]) * scal;
1256  v[ 5] += (15*x[0]*x[1]*x[2]*invr7*f[3]+15*x[0]*x[1]*x[2]*invr7*fdotr-3*x[0]*x[1]*invr5*f[2]-3*x[0]*x[2]*invr5*f[1]+3*x[1]*x[2]*invr5*f[0] ) * scal;
1257  v[ 6] += ((-3*x[2]*invr5*f[3])+15*x[0]*x[0]*x[2]*invr7*f[3]-3*x[2]*invr5*fdotr+15*x[0]*x[0]*x[2]*invr7*fdotr+invr3*f[2]-3*x[0]*x[0]*invr5*f[2] ) * scal;
1258  v[ 7] += (15*x[0]*x[1]*x[2]*invr7*f[3]+15*x[0]*x[1]*x[2]*invr7*fdotr-3*x[0]*x[1]*invr5*f[2]-3*x[0]*x[2]*invr5*f[1]+3*x[1]*x[2]*invr5*f[0] ) * scal;
1259  v[ 8] += ((-3*x[0]*invr5*f[3])+15*x[0]*x[2]*x[2]*invr7*f[3]-3*x[0]*invr5*fdotr+15*x[0]*x[2]*x[2]*invr7*fdotr-6*x[0]*x[2]*invr5*f[2]-invr3*f[0]+3*x[2]*x[2]*invr5*f[0]) * scal;
1260  v[ 9] += ((-3*x[1]*invr5*f[3])+15*x[0]*x[0]*x[1]*invr7*f[3]-3*x[1]*invr5*fdotr+15*x[0]*x[0]*x[1]*invr7*fdotr-invr3*f[1]+3*x[0]*x[0]*invr5*f[1]-6*x[0]*x[1]*invr5*f[0]) * scal;
1261  v[10] += ((-3*x[0]*invr5*f[3])+15*x[0]*x[1]*x[1]*invr7*f[3]-3*x[0]*invr5*fdotr+15*x[0]*x[1]*x[1]*invr7*fdotr+invr3*f[0]-3*x[1]*x[1]*invr5*f[0] ) * scal;
1262  v[11] += (15*x[0]*x[1]*x[2]*invr7*f[3]+15*x[0]*x[1]*x[2]*invr7*fdotr-3*x[0]*x[1]*invr5*f[2]+3*x[0]*x[2]*invr5*f[1]-3*x[1]*x[2]*invr5*f[0] ) * scal;
1263  v[12] += ((-3*x[0]*invr5*f[3])+15*x[0]*x[1]*x[1]*invr7*f[3]-3*x[0]*invr5*fdotr+15*x[0]*x[1]*x[1]*invr7*fdotr+invr3*f[0]-3*x[1]*x[1]*invr5*f[0] ) * scal;
1264  v[13] += ((-9*x[1]*invr5*f[3])+15*x[1]*x[1]*x[1]*invr7*f[3]-9*x[1]*invr5*fdotr+15*x[1]*x[1]*x[1]*invr7*fdotr+invr3*f[1]-3*x[1]*x[1]*invr5*f[1] ) * scal;
1265  v[14] += ((-3*x[2]*invr5*f[3])+15*x[1]*x[1]*x[2]*invr7*f[3]-3*x[2]*invr5*fdotr+15*x[1]*x[1]*x[2]*invr7*fdotr+invr3*f[2]-3*x[1]*x[1]*invr5*f[2] ) * scal;
1266  v[15] += (15*x[0]*x[1]*x[2]*invr7*f[3]+15*x[0]*x[1]*x[2]*invr7*fdotr-3*x[0]*x[1]*invr5*f[2]+3*x[0]*x[2]*invr5*f[1]-3*x[1]*x[2]*invr5*f[0] ) * scal;
1267  v[16] += ((-3*x[2]*invr5*f[3])+15*x[1]*x[1]*x[2]*invr7*f[3]-3*x[2]*invr5*fdotr+15*x[1]*x[1]*x[2]*invr7*fdotr+invr3*f[2]-3*x[1]*x[1]*invr5*f[2] ) * scal;
1268  v[17] += ((-3*x[1]*invr5*f[3])+15*x[1]*x[2]*x[2]*invr7*f[3]-3*x[1]*invr5*fdotr+15*x[1]*x[2]*x[2]*invr7*fdotr-6*x[1]*x[2]*invr5*f[2]-invr3*f[1]+3*x[2]*x[2]*invr5*f[1]) * scal;
1269  v[18] += ((-3*x[2]*invr5*f[3])+15*x[0]*x[0]*x[2]*invr7*f[3]-3*x[2]*invr5*fdotr+15*x[0]*x[0]*x[2]*invr7*fdotr-invr3*f[2]+3*x[0]*x[0]*invr5*f[2]-6*x[0]*x[2]*invr5*f[0]) * scal;
1270  v[19] += (15*x[0]*x[1]*x[2]*invr7*f[3]+15*x[0]*x[1]*x[2]*invr7*fdotr+3*x[0]*x[1]*invr5*f[2]-3*x[0]*x[2]*invr5*f[1]-3*x[1]*x[2]*invr5*f[0] ) * scal;
1271  v[20] += ((-3*x[0]*invr5*f[3])+15*x[0]*x[2]*x[2]*invr7*f[3]-3*x[0]*invr5*fdotr+15*x[0]*x[2]*x[2]*invr7*fdotr+invr3*f[0]-3*x[2]*x[2]*invr5*f[0] ) * scal;
1272  v[21] += (15*x[0]*x[1]*x[2]*invr7*f[3]+15*x[0]*x[1]*x[2]*invr7*fdotr+3*x[0]*x[1]*invr5*f[2]-3*x[0]*x[2]*invr5*f[1]-3*x[1]*x[2]*invr5*f[0] ) * scal;
1273  v[22] += ((-3*x[2]*invr5*f[3])+15*x[1]*x[1]*x[2]*invr7*f[3]-3*x[2]*invr5*fdotr+15*x[1]*x[1]*x[2]*invr7*fdotr-invr3*f[2]+3*x[1]*x[1]*invr5*f[2]-6*x[1]*x[2]*invr5*f[1]) * scal;
1274  v[23] += ((-3*x[1]*invr5*f[3])+15*x[1]*x[2]*x[2]*invr7*f[3]-3*x[1]*invr5*fdotr+15*x[1]*x[2]*x[2]*invr7*fdotr+invr3*f[1]-3*x[2]*x[2]*invr5*f[1] ) * scal;
1275  v[24] += ((-3*x[0]*invr5*f[3])+15*x[0]*x[2]*x[2]*invr7*f[3]-3*x[0]*invr5*fdotr+15*x[0]*x[2]*x[2]*invr7*fdotr+invr3*f[0]-3*x[2]*x[2]*invr5*f[0] ) * scal;
1276  v[25] += ((-3*x[1]*invr5*f[3])+15*x[1]*x[2]*x[2]*invr7*f[3]-3*x[1]*invr5*fdotr+15*x[1]*x[2]*x[2]*invr7*fdotr+invr3*f[1]-3*x[2]*x[2]*invr5*f[1] ) * scal;
1277  v[26] += ((-9*x[2]*invr5*f[3])+15*x[2]*x[2]*x[2]*invr7*f[3]-9*x[2]*invr5*fdotr+15*x[2]*x[2]*x[2]*invr7*fdotr+invr3*f[2]-3*x[2]*x[2]*invr5*f[2] ) * scal;
1278  };
1279  return KernelFromLambda<k_dim0, k_dim1>(__FUNCTION__, micro_ker);
1280  }
1281 
1282  inline static const KernelFunction<ValueType, DIM>& FSxPU() {
1283  constexpr Integer k_dim0 = DIM+1;
1284  constexpr Integer k_dim1 = DIM+1;
1285  static const ValueType scal = 1.0/(8.0*const_pi<ValueType>());
1286  static auto micro_ker = [&](ValueType* v, const ValueType* x, ValueType invr, const ValueType* n, const ValueType* f) {
1287  ValueType fdotr = 0;
1288  ValueType invr2 = invr*invr;
1289  ValueType invr3 = invr2*invr;
1290  ValueType invr5 = invr2*invr3;
1291  ValueType invr7 = invr2*invr5;
1292  for(Integer k=0;k<DIM;k++) fdotr += x[k] * f[k];
1293 
1294  v[0] += (2*invr3*fdotr ) * scal;
1295  v[1] += (x[0]*invr3*f[3]+x[0]*invr3*fdotr+f[0]*invr) * scal;
1296  v[2] += (x[1]*invr3*f[3]+x[1]*invr3*fdotr+f[1]*invr) * scal;
1297  v[3] += (x[2]*invr3*f[3]+x[2]*invr3*fdotr+f[2]*invr) * scal;
1298  };
1299  return KernelFromLambda<k_dim0, k_dim1>(__FUNCTION__, micro_ker);
1300  }
1301 
1303 
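 // Annotation: the Dx* kernels below are the Stokes double-layer (stresslet)
 // family; each takes the source normal n in addition to the density f and is
 // assembled from the pairwise dot products fdotr, ndotr, ndotf.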
1304  inline static const KernelFunction<ValueType, DIM>& DxP() {
1305  constexpr Integer k_dim0 = DIM;
1306  constexpr Integer k_dim1 = 1;
1307  static const ValueType scal = 1.0/(8.0*const_pi<ValueType>());
1308  static auto micro_ker = [&](ValueType* v, const ValueType* x, ValueType invr, const ValueType* n, const ValueType* f) {
1309  ValueType fdotr = 0;
1310  ValueType ndotr = 0;
1311  ValueType ndotf = 0;
1312  ValueType invr2 = invr*invr;
1313  ValueType invr3 = invr2*invr;
1314  ValueType invr5 = invr2*invr3;
1315  ValueType invr7 = invr2*invr5;
1316  for(Integer k=0;k<DIM;k++) fdotr += f[k] * x[k];
1317  for(Integer k=0;k<DIM;k++) ndotr += n[k] * x[k];
1318  for(Integer k=0;k<DIM;k++) ndotf += n[k] * f[k];
1319 
1320  v[0] += 4*(invr3*(-ndotf)+3*invr5*fdotr*ndotr) * scal;
1321  };
1322  return KernelFromLambda<k_dim0, k_dim1>(__FUNCTION__, micro_ker);
1323  }
1324 
1325  inline static const KernelFunction<ValueType, DIM>& DxdP() {
1326  constexpr Integer k_dim0 = DIM;
1327  constexpr Integer k_dim1 = DIM;
1328  static const ValueType scal = 1.0/(8.0*const_pi<ValueType>());
1329  static auto micro_ker = [&](ValueType* v, const ValueType* x, ValueType invr, const ValueType* n, const ValueType* f) {
1330  ValueType fdotr = 0;
1331  ValueType ndotr = 0;
1332  ValueType ndotf = 0;
1333  ValueType invr2 = invr*invr;
1334  ValueType invr3 = invr2*invr;
1335  ValueType invr5 = invr2*invr3;
1336  ValueType invr7 = invr2*invr5;
1337  for(Integer k=0;k<DIM;k++) fdotr += f[k] * x[k];
1338  for(Integer k=0;k<DIM;k++) ndotr += n[k] * x[k];
1339  for(Integer k=0;k<DIM;k++) ndotf += n[k] * f[k];
1340 
1341  v[0] += 4*((-3*x[0]*invr5*(-ndotf))-15*x[0]*invr7*fdotr*ndotr+3*invr5*f[0]*ndotr+3*invr5*fdotr*n[0]) * scal;
1342  v[1] += 4*((-3*x[1]*invr5*(-ndotf))-15*x[1]*invr7*fdotr*ndotr+3*invr5*f[1]*ndotr+3*invr5*fdotr*n[1]) * scal;
1343  v[2] += 4*((-3*x[2]*invr5*(-ndotf))-15*x[2]*invr7*fdotr*ndotr+3*invr5*f[2]*ndotr+3*invr5*fdotr*n[2]) * scal;
1344  };
1345  return KernelFromLambda<k_dim0, k_dim1>(__FUNCTION__, micro_ker);
1346  }
1347 
1348  inline static const KernelFunction<ValueType, DIM>& DxU() {
1349  constexpr Integer k_dim0 = DIM;
1350  constexpr Integer k_dim1 = DIM;
1351  static const ValueType scal = 1.0/(8.0*const_pi<ValueType>());
1352  static auto micro_ker = [&](ValueType* v, const ValueType* x, ValueType invr, const ValueType* n, const ValueType* f) {
1353  ValueType fdotr = 0;
1354  ValueType ndotr = 0;
1355  ValueType ndotf = 0;
1356  ValueType invr2 = invr*invr;
1357  ValueType invr3 = invr2*invr;
1358  ValueType invr5 = invr2*invr3;
1359  ValueType invr7 = invr2*invr5;
1360  for(Integer k=0;k<DIM;k++) fdotr += f[k] * x[k];
1361  for(Integer k=0;k<DIM;k++) ndotr += n[k] * x[k];
1362  for(Integer k=0;k<DIM;k++) ndotf += n[k] * f[k];
1363 
1364  v[0] += (6*x[0]*invr5*fdotr*ndotr) * scal;
1365  v[1] += (6*x[1]*invr5*fdotr*ndotr) * scal;
1366  v[2] += (6*x[2]*invr5*fdotr*ndotr) * scal;
1367  };
1368  return KernelFromLambda<k_dim0, k_dim1>(__FUNCTION__, micro_ker);
1369  }
1370 
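 // Annotation: DxU is the stresslet velocity. With scal = 1/(8*pi) the update
 // v[i] += 6*x[i]*invr5*fdotr*ndotr*scal is
 //
 //   u_i(x) = (3/(4*pi)) * r_i (f . r)(n . r) / |r|^5.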
1371  inline static const KernelFunction<ValueType, DIM>& DxdU() {
1372  constexpr Integer k_dim0 = DIM;
1373  constexpr Integer k_dim1 = DIM * DIM;
1374  static const ValueType scal = 1.0/(8.0*const_pi<ValueType>());
1375  static auto micro_ker = [&](ValueType* v, const ValueType* x, ValueType invr, const ValueType* n, const ValueType* f) {
1376  ValueType fdotr = 0;
1377  ValueType ndotr = 0;
1378  ValueType ndotf = 0;
1379  ValueType invr2 = invr*invr;
1380  ValueType invr3 = invr2*invr;
1381  ValueType invr5 = invr2*invr3;
1382  ValueType invr7 = invr2*invr5;
1383  for(Integer k=0;k<DIM;k++) fdotr += f[k] * x[k];
1384  for(Integer k=0;k<DIM;k++) ndotr += n[k] * x[k];
1385  for(Integer k=0;k<DIM;k++) ndotf += n[k] * f[k];
1386 
1387  v[0] += (6*invr5*fdotr*ndotr-30*x[0]*x[0]*invr7*fdotr*ndotr+6*x[0]*invr5*f[0]*ndotr+6*x[0]*invr5*fdotr*n[0]) * scal;
1388  v[1] += ((-30*x[0]*x[1]*invr7*fdotr*ndotr)+6*x[0]*invr5*f[1]*ndotr+6*x[0]*invr5*fdotr*n[1] ) * scal;
1389  v[2] += ((-30*x[0]*x[2]*invr7*fdotr*ndotr)+6*x[0]*invr5*f[2]*ndotr+6*x[0]*invr5*fdotr*n[2] ) * scal;
1390  v[3] += ((-30*x[0]*x[1]*invr7*fdotr*ndotr)+6*x[1]*invr5*f[0]*ndotr+6*x[1]*invr5*fdotr*n[0] ) * scal;
1391  v[4] += (6*invr5*fdotr*ndotr-30*x[1]*x[1]*invr7*fdotr*ndotr+6*x[1]*invr5*f[1]*ndotr+6*x[1]*invr5*fdotr*n[1]) * scal;
1392  v[5] += ((-30*x[1]*x[2]*invr7*fdotr*ndotr)+6*x[1]*invr5*f[2]*ndotr+6*x[1]*invr5*fdotr*n[2] ) * scal;
1393  v[6] += ((-30*x[0]*x[2]*invr7*fdotr*ndotr)+6*x[2]*invr5*f[0]*ndotr+6*x[2]*invr5*fdotr*n[0] ) * scal;
1394  v[7] += ((-30*x[1]*x[2]*invr7*fdotr*ndotr)+6*x[2]*invr5*f[1]*ndotr+6*x[2]*invr5*fdotr*n[1] ) * scal;
1395  v[8] += (6*invr5*fdotr*ndotr-30*x[2]*x[2]*invr7*fdotr*ndotr+6*x[2]*invr5*f[2]*ndotr+6*x[2]*invr5*fdotr*n[2]) * scal;
1396  };
1397  return KernelFromLambda<k_dim0, k_dim1>(__FUNCTION__, micro_ker);
1398  }
1399 
1400  inline static const KernelFunction<ValueType, DIM>& Dxd2U() {
1401  constexpr Integer k_dim0 = DIM;
1402  constexpr Integer k_dim1 = DIM * DIM * DIM;
1403  static const ValueType scal = 1.0/(8.0*const_pi<ValueType>());
1404  static auto micro_ker = [&](ValueType* v, const ValueType* x, ValueType invr, const ValueType* n, const ValueType* f) {
1405  ValueType fdotr = 0;
1406  ValueType ndotr = 0;
1407  ValueType ndotf = 0;
1408  ValueType invr2 = invr*invr;
1409  ValueType invr3 = invr2*invr;
1410  ValueType invr5 = invr2*invr3;
1411  ValueType invr7 = invr2*invr5;
1412  ValueType invr9 = invr2*invr7;
1413  for(Integer k=0;k<DIM;k++) fdotr += f[k] * x[k];
1414  for(Integer k=0;k<DIM;k++) ndotr += n[k] * x[k];
1415  for(Integer k=0;k<DIM;k++) ndotf += n[k] * f[k];
1416 
1417  v[ 0] += ((-90*x[0]*invr7*fdotr*ndotr)+210*x[0]*x[0]*x[0]*invr9*fdotr*ndotr+12*invr5*f[0]*ndotr-60*x[0]*x[0]*invr7*f[0]*ndotr+12*invr5*fdotr*n[0]-60*x[0]*x[0]*invr7*fdotr*n[0]+12*x[0]*invr5*f[0]*n[0] ) * scal;
1418  v[ 1] += ((-30*x[1]*invr7*fdotr*ndotr)+210*x[0]*x[0]*x[1]*invr9*fdotr*ndotr+6*invr5*f[1]*ndotr-30*x[0]*x[0]*invr7*f[1]*ndotr-30*x[0]*x[1]*invr7*f[0]*ndotr+6*invr5*fdotr*n[1]-30*x[0]*x[0]*invr7*fdotr*n[1]+6*x[0]*invr5*f[0]*n[1]-30*x[0]*x[1]*invr7*fdotr*n[0]+6*x[0]*invr5*f[1]*n[0]) * scal;
1419  v[ 2] += ((-30*x[2]*invr7*fdotr*ndotr)+210*x[0]*x[0]*x[2]*invr9*fdotr*ndotr+6*invr5*f[2]*ndotr-30*x[0]*x[0]*invr7*f[2]*ndotr-30*x[0]*x[2]*invr7*f[0]*ndotr+6*invr5*fdotr*n[2]-30*x[0]*x[0]*invr7*fdotr*n[2]+6*x[0]*invr5*f[0]*n[2]-30*x[0]*x[2]*invr7*fdotr*n[0]+6*x[0]*invr5*f[2]*n[0]) * scal;
1420  v[ 3] += ((-30*x[1]*invr7*fdotr*ndotr)+210*x[0]*x[0]*x[1]*invr9*fdotr*ndotr+6*invr5*f[1]*ndotr-30*x[0]*x[0]*invr7*f[1]*ndotr-30*x[0]*x[1]*invr7*f[0]*ndotr+6*invr5*fdotr*n[1]-30*x[0]*x[0]*invr7*fdotr*n[1]+6*x[0]*invr5*f[0]*n[1]-30*x[0]*x[1]*invr7*fdotr*n[0]+6*x[0]*invr5*f[1]*n[0]) * scal;
1421  v[ 4] += ((-30*x[0]*invr7*fdotr*ndotr)+210*x[0]*x[1]*x[1]*invr9*fdotr*ndotr-60*x[0]*x[1]*invr7*f[1]*ndotr-60*x[0]*x[1]*invr7*fdotr*n[1]+12*x[0]*invr5*f[1]*n[1] ) * scal;
1422  v[ 5] += (210*x[0]*x[1]*x[2]*invr9*fdotr*ndotr-30*x[0]*x[1]*invr7*f[2]*ndotr-30*x[0]*x[2]*invr7*f[1]*ndotr-30*x[0]*x[1]*invr7*fdotr*n[2]+6*x[0]*invr5*f[1]*n[2]-30*x[0]*x[2]*invr7*fdotr*n[1]+6*x[0]*invr5*f[2]*n[1] ) * scal;
1423  v[ 6] += ((-30*x[2]*invr7*fdotr*ndotr)+210*x[0]*x[0]*x[2]*invr9*fdotr*ndotr+6*invr5*f[2]*ndotr-30*x[0]*x[0]*invr7*f[2]*ndotr-30*x[0]*x[2]*invr7*f[0]*ndotr+6*invr5*fdotr*n[2]-30*x[0]*x[0]*invr7*fdotr*n[2]+6*x[0]*invr5*f[0]*n[2]-30*x[0]*x[2]*invr7*fdotr*n[0]+6*x[0]*invr5*f[2]*n[0]) * scal;
1424  v[ 7] += (210*x[0]*x[1]*x[2]*invr9*fdotr*ndotr-30*x[0]*x[1]*invr7*f[2]*ndotr-30*x[0]*x[2]*invr7*f[1]*ndotr-30*x[0]*x[1]*invr7*fdotr*n[2]+6*x[0]*invr5*f[1]*n[2]-30*x[0]*x[2]*invr7*fdotr*n[1]+6*x[0]*invr5*f[2]*n[1] ) * scal;
1425  v[ 8] += ((-30*x[0]*invr7*fdotr*ndotr)+210*x[0]*x[2]*x[2]*invr9*fdotr*ndotr-60*x[0]*x[2]*invr7*f[2]*ndotr-60*x[0]*x[2]*invr7*fdotr*n[2]+12*x[0]*invr5*f[2]*n[2] ) * scal;
1426  v[ 9] += ((-30*x[1]*invr7*fdotr*ndotr)+210*x[0]*x[0]*x[1]*invr9*fdotr*ndotr-60*x[0]*x[1]*invr7*f[0]*ndotr-60*x[0]*x[1]*invr7*fdotr*n[0]+12*x[1]*invr5*f[0]*n[0] ) * scal;
1427  v[10] += ((-30*x[0]*invr7*fdotr*ndotr)+210*x[0]*x[1]*x[1]*invr9*fdotr*ndotr-30*x[0]*x[1]*invr7*f[1]*ndotr+6*invr5*f[0]*ndotr-30*x[1]*x[1]*invr7*f[0]*ndotr-30*x[0]*x[1]*invr7*fdotr*n[1]+6*x[1]*invr5*f[0]*n[1]+6*invr5*fdotr*n[0]-30*x[1]*x[1]*invr7*fdotr*n[0]+6*x[1]*invr5*f[1]*n[0]) * scal;
1428  v[11] += (210*x[0]*x[1]*x[2]*invr9*fdotr*ndotr-30*x[0]*x[1]*invr7*f[2]*ndotr-30*x[1]*x[2]*invr7*f[0]*ndotr-30*x[0]*x[1]*invr7*fdotr*n[2]+6*x[1]*invr5*f[0]*n[2]-30*x[1]*x[2]*invr7*fdotr*n[0]+6*x[1]*invr5*f[2]*n[0] ) * scal;
1429  v[12] += ((-30*x[0]*invr7*fdotr*ndotr)+210*x[0]*x[1]*x[1]*invr9*fdotr*ndotr-30*x[0]*x[1]*invr7*f[1]*ndotr+6*invr5*f[0]*ndotr-30*x[1]*x[1]*invr7*f[0]*ndotr-30*x[0]*x[1]*invr7*fdotr*n[1]+6*x[1]*invr5*f[0]*n[1]+6*invr5*fdotr*n[0]-30*x[1]*x[1]*invr7*fdotr*n[0]+6*x[1]*invr5*f[1]*n[0]) * scal;
1430  v[13] += ((-90*x[1]*invr7*fdotr*ndotr)+210*x[1]*x[1]*x[1]*invr9*fdotr*ndotr+12*invr5*f[1]*ndotr-60*x[1]*x[1]*invr7*f[1]*ndotr+12*invr5*fdotr*n[1]-60*x[1]*x[1]*invr7*fdotr*n[1]+12*x[1]*invr5*f[1]*n[1] ) * scal;
1431  v[14] += ((-30*x[2]*invr7*fdotr*ndotr)+210*x[1]*x[1]*x[2]*invr9*fdotr*ndotr+6*invr5*f[2]*ndotr-30*x[1]*x[1]*invr7*f[2]*ndotr-30*x[1]*x[2]*invr7*f[1]*ndotr+6*invr5*fdotr*n[2]-30*x[1]*x[1]*invr7*fdotr*n[2]+6*x[1]*invr5*f[1]*n[2]-30*x[1]*x[2]*invr7*fdotr*n[1]+6*x[1]*invr5*f[2]*n[1]) * scal;
1432  v[15] += (210*x[0]*x[1]*x[2]*invr9*fdotr*ndotr-30*x[0]*x[1]*invr7*f[2]*ndotr-30*x[1]*x[2]*invr7*f[0]*ndotr-30*x[0]*x[1]*invr7*fdotr*n[2]+6*x[1]*invr5*f[0]*n[2]-30*x[1]*x[2]*invr7*fdotr*n[0]+6*x[1]*invr5*f[2]*n[0] ) * scal;
1433  v[16] += ((-30*x[2]*invr7*fdotr*ndotr)+210*x[1]*x[1]*x[2]*invr9*fdotr*ndotr+6*invr5*f[2]*ndotr-30*x[1]*x[1]*invr7*f[2]*ndotr-30*x[1]*x[2]*invr7*f[1]*ndotr+6*invr5*fdotr*n[2]-30*x[1]*x[1]*invr7*fdotr*n[2]+6*x[1]*invr5*f[1]*n[2]-30*x[1]*x[2]*invr7*fdotr*n[1]+6*x[1]*invr5*f[2]*n[1]) * scal;
1434  v[17] += ((-30*x[1]*invr7*fdotr*ndotr)+210*x[1]*x[2]*x[2]*invr9*fdotr*ndotr-60*x[1]*x[2]*invr7*f[2]*ndotr-60*x[1]*x[2]*invr7*fdotr*n[2]+12*x[1]*invr5*f[2]*n[2] ) * scal;
1435  v[18] += ((-30*x[2]*invr7*fdotr*ndotr)+210*x[0]*x[0]*x[2]*invr9*fdotr*ndotr-60*x[0]*x[2]*invr7*f[0]*ndotr-60*x[0]*x[2]*invr7*fdotr*n[0]+12*x[2]*invr5*f[0]*n[0] ) * scal;
1436  v[19] += (210*x[0]*x[1]*x[2]*invr9*fdotr*ndotr-30*x[0]*x[2]*invr7*f[1]*ndotr-30*x[1]*x[2]*invr7*f[0]*ndotr-30*x[0]*x[2]*invr7*fdotr*n[1]+6*x[2]*invr5*f[0]*n[1]-30*x[1]*x[2]*invr7*fdotr*n[0]+6*x[2]*invr5*f[1]*n[0] ) * scal;
1437  v[20] += ((-30*x[0]*invr7*fdotr*ndotr)+210*x[0]*x[2]*x[2]*invr9*fdotr*ndotr-30*x[0]*x[2]*invr7*f[2]*ndotr+6*invr5*f[0]*ndotr-30*x[2]*x[2]*invr7*f[0]*ndotr-30*x[0]*x[2]*invr7*fdotr*n[2]+6*x[2]*invr5*f[0]*n[2]+6*invr5*fdotr*n[0]-30*x[2]*x[2]*invr7*fdotr*n[0]+6*x[2]*invr5*f[2]*n[0]) * scal;
1438  v[21] += (210*x[0]*x[1]*x[2]*invr9*fdotr*ndotr-30*x[0]*x[2]*invr7*f[1]*ndotr-30*x[1]*x[2]*invr7*f[0]*ndotr-30*x[0]*x[2]*invr7*fdotr*n[1]+6*x[2]*invr5*f[0]*n[1]-30*x[1]*x[2]*invr7*fdotr*n[0]+6*x[2]*invr5*f[1]*n[0] ) * scal;
1439  v[22] += ((-30*x[2]*invr7*fdotr*ndotr)+210*x[1]*x[1]*x[2]*invr9*fdotr*ndotr-60*x[1]*x[2]*invr7*f[1]*ndotr-60*x[1]*x[2]*invr7*fdotr*n[1]+12*x[2]*invr5*f[1]*n[1] ) * scal;
1440  v[23] += ((-30*x[1]*invr7*fdotr*ndotr)+210*x[1]*x[2]*x[2]*invr9*fdotr*ndotr-30*x[1]*x[2]*invr7*f[2]*ndotr+6*invr5*f[1]*ndotr-30*x[2]*x[2]*invr7*f[1]*ndotr-30*x[1]*x[2]*invr7*fdotr*n[2]+6*x[2]*invr5*f[1]*n[2]+6*invr5*fdotr*n[1]-30*x[2]*x[2]*invr7*fdotr*n[1]+6*x[2]*invr5*f[2]*n[1]) * scal;
1441  v[24] += ((-30*x[0]*invr7*fdotr*ndotr)+210*x[0]*x[2]*x[2]*invr9*fdotr*ndotr-30*x[0]*x[2]*invr7*f[2]*ndotr+6*invr5*f[0]*ndotr-30*x[2]*x[2]*invr7*f[0]*ndotr-30*x[0]*x[2]*invr7*fdotr*n[2]+6*x[2]*invr5*f[0]*n[2]+6*invr5*fdotr*n[0]-30*x[2]*x[2]*invr7*fdotr*n[0]+6*x[2]*invr5*f[2]*n[0]) * scal;
1442  v[25] += ((-30*x[1]*invr7*fdotr*ndotr)+210*x[1]*x[2]*x[2]*invr9*fdotr*ndotr-30*x[1]*x[2]*invr7*f[2]*ndotr+6*invr5*f[1]*ndotr-30*x[2]*x[2]*invr7*f[1]*ndotr-30*x[1]*x[2]*invr7*fdotr*n[2]+6*x[2]*invr5*f[1]*n[2]+6*invr5*fdotr*n[1]-30*x[2]*x[2]*invr7*fdotr*n[1]+6*x[2]*invr5*f[2]*n[1]) * scal;
1443  v[26] += ((-90*x[2]*invr7*fdotr*ndotr)+210*x[2]*x[2]*x[2]*invr9*fdotr*ndotr+12*invr5*f[2]*ndotr-60*x[2]*x[2]*invr7*f[2]*ndotr+12*invr5*fdotr*n[2]-60*x[2]*x[2]*invr7*fdotr*n[2]+12*x[2]*invr5*f[2]*n[2] ) * scal;
1444  };
1445  return KernelFromLambda<k_dim0, k_dim1>(__FUNCTION__, micro_ker);
1446  }
1447 
1448  inline static const KernelFunction<ValueType, DIM>& DxPU() {
1449  constexpr Integer k_dim0 = DIM;
1450  constexpr Integer k_dim1 = DIM+1;
1451  static const ValueType scal = 1.0/(8.0*const_pi<ValueType>());
1452  static auto micro_ker = [&](ValueType* v, const ValueType* x, ValueType invr, const ValueType* n, const ValueType* f) {
1453  ValueType fdotr = 0;
1454  ValueType ndotr = 0;
1455  ValueType ndotf = 0;
1456  ValueType invr2 = invr*invr;
1457  ValueType invr3 = invr2*invr;
1458  ValueType invr5 = invr2*invr3;
1459  ValueType invr7 = invr2*invr5;
1460  for(Integer k=0;k<DIM;k++) fdotr += f[k] * x[k];
1461  for(Integer k=0;k<DIM;k++) ndotr += n[k] * x[k];
1462  for(Integer k=0;k<DIM;k++) ndotf += n[k] * f[k];
1463 
1464  v[0] += 4*(invr3*(-ndotf)+3*invr5*fdotr*ndotr) * scal;
1465  v[1] += (6*x[0]*invr5*fdotr*ndotr) * scal;
1466  v[2] += (6*x[1]*invr5*fdotr*ndotr) * scal;
1467  v[3] += (6*x[2]*invr5*fdotr*ndotr) * scal;
1468  };
1469  return KernelFromLambda<k_dim0, k_dim1>(__FUNCTION__, micro_ker);
1470  }
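// ===[ Editorial example; not part of the original kernel.hpp listing ]========
// Sketch of how the lambda-based accessors above can be used, assuming the
// enclosing class is Stokes3D<ValueType> with DIM == 3 (as the dispatch code
// further below suggests) and that this header's types are in scope. Judging
// by k_dim1 == DIM + 1 and the component formulas, DxPU() packs a pressure-type
// component followed by the DIM double-layer velocity components per target.
// All names in this sketch are hypothetical.
template <class Real> void example_eval_DxPU() {
  const pvfmm::KernelFunction<Real, 3>& ker = pvfmm::Stokes3D<Real>::DxPU();
  pvfmm::Vector<Real> r_src(3), n_src(3), v_src(3), r_trg(3), v_trg(4);
  r_src.SetZero(); n_src.SetZero(); v_src.SetZero(); r_trg.SetZero();
  r_trg[0] = 1;  // target at unit distance from the source along x
  n_src[2] = 1;  // source normal along z
  v_src[0] = 1;  // unit density along x
  v_trg.SetZero();                         // operator() accumulates into v_trg
  ker(r_src, n_src, v_src, r_trg, v_trg);  // v_trg[0]: pressure, v_trg[1..3]: velocity
}
// =============================================================================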
1471 
1472  private:
1473 
1474  static void GenKer(const Vector<ValueType>& r_src, const Vector<ValueType>& n_src, const Vector<ValueType>& v_src, const Vector<ValueType>& r_trg, Vector<ValueType>& v_trg, const void* ctx){
1475  const auto& ker = *(std::function<typename KernelFunction<ValueType, DIM>::KerFn>*)(ctx);
1476  ker(r_src, n_src, v_src, r_trg, v_trg, NULL);
1477  }
1478 
1479  template <Integer k_dim0, Integer k_dim1, class LambdaType> inline static const KernelFunction<ValueType, DIM>& KernelFromLambda(std::string name, LambdaType&& micro_ker) {
1480  static std::function<typename KernelFunction<ValueType, DIM>::KerFn> ker_fn = [&](const Vector<ValueType>& r_src, const Vector<ValueType>& n_src, const Vector<ValueType>& v_src, const Vector<ValueType>& r_trg, Vector<ValueType>& v_trg, const void* ctx){
1481  Long src_cnt = r_src.Dim() / DIM;
1482  Long trg_cnt = r_trg.Dim() / DIM;
1483  for (Long t=0;t<trg_cnt;t++){
1484  ValueType v[k_dim1];
1485  for (Integer i=0;i<k_dim1;i++) {
1486  v[i]=0;
1487  }
1488  for (Long s=0;s<src_cnt;s++){
1489  ValueType dx[DIM], f[k_dim0], r2 = 0, invr = 0;
1490  for(Integer k=0;k<DIM;k++) {
1491  dx[k] = r_trg[t*DIM+k] - r_src[s*DIM+k];
1492  r2 += dx[k] * dx[k];
1493  }
1494  for(Integer k=0;k<k_dim0;k++) {
1495  f[k] = v_src[s*k_dim0+k];
1496  }
1497  if(r2>0) invr=1.0/sqrt(r2);
1498  micro_ker(v, dx, invr, &n_src[s*DIM], f);
1499  }
1500  for (Integer i=0;i<k_dim1;i++) {
1501  v_trg[t*k_dim1+i] += v[i];
1502  }
1503  }
1504  };
1505  static KernelFunction<ValueType, DIM> ker(GenKer, k_dim0, k_dim1, name, &ker_fn);
1506  return ker;
1507  }
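// ===[ Editorial sketch; not part of the original kernel.hpp listing ]=========
// KernelFromLambda above type-erases the micro kernel: the wrapping lambda is
// stored in a function-local static std::function, that object's address is
// handed to KernelFunction as the opaque ctx, and GenKer casts ctx back to
// invoke it. A minimal standalone analogue of this trampoline pattern
// (hypothetical names):
#include <functional>
#include <iostream>

using Fn = std::function<void(int)>;

void trampoline(int x, const void* ctx) {
  (*static_cast<const Fn*>(ctx))(x);  // recover the type-erased callable
}

int run_trampoline_demo() {
  static Fn fn = [](int x) { std::cout << x << '\n'; };
  trampoline(7, &fn);  // prints 7
  return 0;
}
// =============================================================================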
1508 
1510 
1511  public:
1512  inline static const KernelFunction<ValueType, DIM>& single_layer_velocity() {
1513  static KernelFunction<ValueType, DIM> ker(sl_vel<2>, DIM, DIM, "stokes-vel-sl");
1514  return ker;
1515  }
1516 
1517  inline static const KernelFunction<ValueType, DIM>& single_layer_velocity_m2x() {
1518  static KernelFunction<ValueType, DIM> ker(sl_vel_m2x<2>, DIM + 1, DIM, "stokes-vel-sl-m2x");
1519  return ker;
1520  }
1521 
1522  inline static const KernelFunction<ValueType, DIM>& double_layer_velocity() {
1523  static KernelFunction<ValueType, DIM> ker(dl_vel<2>, DIM, DIM, "stokes-vel-dl");
1524  return ker;
1525  }
1526 
1527  inline static const KernelFunction<ValueType, DIM>& single_layer_pressure() {
1528  static KernelFunction<ValueType, DIM> ker(sl_press<2>, DIM, 1, "stokes-press-sl");
1529  return ker;
1530  }
1531 
1532  inline static const KernelFunction<ValueType, DIM>& single_layer_pressure_m2x() {
1533  static KernelFunction<ValueType, DIM> ker(sl_press_m2x<2>, DIM + 1, 1, "stokes-press-sl-m2x");
1534  return ker;
1535  }
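// ===[ Editorial example; not part of the original kernel.hpp listing ]========
// The accessors above return process-wide singletons wrapping the vectorized
// Stokes kernels with two Newton refinements (the <2> template argument). They
// can be used directly, e.g. to assemble the dense source-to-target matrix via
// BuildMatrix; the function and argument names below are hypothetical.
template <class Real>
void example_stokes_matrix(const pvfmm::Vector<Real>& r_src,
                           const pvfmm::Vector<Real>& r_trg) {
  const auto& ker = pvfmm::Stokes3D<Real>::single_layer_velocity();
  pvfmm::Vector<Real> n_src;               // normals unused by the single layer
  pvfmm::Matrix<Real> M;                   // (Nsrc*3) x (Ntrg*3) on return
  ker.BuildMatrix(r_src, n_src, r_trg, M);
}
// =============================================================================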
1536 
1537  protected:
1538  template <class Vec = ValueType, Vec (*RSQRT_INTRIN)(Vec) = rsqrt_intrin0<Vec>> static void sl_vel_uKernel(const Matrix<ValueType>& src_coord, const Matrix<ValueType>& src_norml, const Matrix<ValueType>& src_value, const Matrix<ValueType>& trg_coord, Matrix<ValueType>& trg_value) {
1539  #define SRC_BLK 500
1540  static ValueType eps = machine_eps() * 128;
1541  size_t VecLen = sizeof(Vec) / sizeof(ValueType);
1542 
1544  size_t NWTN_ITER = 0;
1545  if (RSQRT_INTRIN == (Vec (*)(Vec))rsqrt_intrin0<Vec, ValueType>) NWTN_ITER = 0;
1546  if (RSQRT_INTRIN == (Vec (*)(Vec))rsqrt_intrin1<Vec, ValueType>) NWTN_ITER = 1;
1547  if (RSQRT_INTRIN == (Vec (*)(Vec))rsqrt_intrin2<Vec, ValueType>) NWTN_ITER = 2;
1548  if (RSQRT_INTRIN == (Vec (*)(Vec))rsqrt_intrin3<Vec, ValueType>) NWTN_ITER = 3;
1549 
1550  ValueType nwtn_scal = 1; // scaling factor for newton iterations
1551  for (int i = 0; i < NWTN_ITER; i++) {
1552  nwtn_scal = 2 * nwtn_scal * nwtn_scal * nwtn_scal;
1553  }
1554  const ValueType OOEP = 1.0 / (8 * nwtn_scal * const_pi<ValueType>());
1555  Vec inv_nwtn_scal2 = set_intrin<Vec, ValueType>(1.0 / (nwtn_scal * nwtn_scal));
1556 
1557  size_t src_cnt_ = src_coord.Dim(1);
1558  size_t trg_cnt_ = trg_coord.Dim(1);
1559  for (size_t sblk = 0; sblk < src_cnt_; sblk += SRC_BLK) {
1560  size_t src_cnt = src_cnt_ - sblk;
1561  if (src_cnt > SRC_BLK) src_cnt = SRC_BLK;
1562  for (size_t t = 0; t < trg_cnt_; t += VecLen) {
1563  Vec tx = load_intrin<Vec>(&trg_coord[0][t]);
1564  Vec ty = load_intrin<Vec>(&trg_coord[1][t]);
1565  Vec tz = load_intrin<Vec>(&trg_coord[2][t]);
1566 
1567  Vec tvx = zero_intrin<Vec>();
1568  Vec tvy = zero_intrin<Vec>();
1569  Vec tvz = zero_intrin<Vec>();
1570  for (size_t s = sblk; s < sblk + src_cnt; s++) {
1571  Vec dx = sub_intrin(tx, bcast_intrin<Vec>(&src_coord[0][s]));
1572  Vec dy = sub_intrin(ty, bcast_intrin<Vec>(&src_coord[1][s]));
1573  Vec dz = sub_intrin(tz, bcast_intrin<Vec>(&src_coord[2][s]));
1574 
1575  Vec svx = bcast_intrin<Vec>(&src_value[0][s]);
1576  Vec svy = bcast_intrin<Vec>(&src_value[1][s]);
1577  Vec svz = bcast_intrin<Vec>(&src_value[2][s]);
1578 
1579  Vec r2 = mul_intrin(dx, dx);
1580  r2 = add_intrin(r2, mul_intrin(dy, dy));
1581  r2 = add_intrin(r2, mul_intrin(dz, dz));
1582  r2 = and_intrin(cmplt_intrin(set_intrin<Vec, ValueType>(eps), r2), r2);
1583 
1584  Vec rinv = RSQRT_INTRIN(r2);
1585  Vec rinv2 = mul_intrin(mul_intrin(rinv, rinv), inv_nwtn_scal2);
1586 
1587  Vec inner_prod = mul_intrin(svx, dx);
1588  inner_prod = add_intrin(inner_prod, mul_intrin(svy, dy));
1589  inner_prod = add_intrin(inner_prod, mul_intrin(svz, dz));
1590  inner_prod = mul_intrin(inner_prod, rinv2);
1591 
1592  tvx = add_intrin(tvx, mul_intrin(rinv, add_intrin(svx, mul_intrin(dx, inner_prod))));
1593  tvy = add_intrin(tvy, mul_intrin(rinv, add_intrin(svy, mul_intrin(dy, inner_prod))));
1594  tvz = add_intrin(tvz, mul_intrin(rinv, add_intrin(svz, mul_intrin(dz, inner_prod))));
1595  }
1596  Vec ooep = set_intrin<Vec, ValueType>(OOEP);
1597 
1598  tvx = add_intrin(mul_intrin(tvx, ooep), load_intrin<Vec>(&trg_value[0][t]));
1599  tvy = add_intrin(mul_intrin(tvy, ooep), load_intrin<Vec>(&trg_value[1][t]));
1600  tvz = add_intrin(mul_intrin(tvz, ooep), load_intrin<Vec>(&trg_value[2][t]));
1601 
1602  store_intrin(&trg_value[0][t], tvx);
1603  store_intrin(&trg_value[1][t], tvy);
1604  store_intrin(&trg_value[2][t], tvz);
1605  }
1606  }
1607 
1608  { // Add FLOPS
1609  #ifndef __MIC__
1610  //Profile::Add_FLOP((long long)trg_cnt_ * (long long)src_cnt_ * (29 + 4 * (NWTN_ITER)));
1611  #endif
1612  }
1613  #undef SRC_BLK
1614  }
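// ===[ Editorial note; not part of the original kernel.hpp listing ]===========
// Why nwtn_scal is updated as 2*s^3: the rsqrt_intrin<k> variants appear to
// run k Newton steps of the form
//     y <- y * (3*s^2 - r2*y*y)
// with the usual factor 1/2 folded into the accumulated scale s, so y ~= s/r
// becomes y ~= 2*s^3/r after one step. The kernel therefore divides the final
// sum by nwtn_scal (see OOEP) and multiplies rinv*rinv by 1/nwtn_scal^2. A
// scalar check of this bookkeeping (the intrinsics' form is an assumption; the
// arithmetic below is self-contained):
#include <cmath>
#include <cassert>

inline void check_nwtn_scal() {
  double r2 = 2.0, s = 1.0;
  double y = 1.0 / std::sqrt(r2);      // exact start, scale s == 1
  for (int i = 0; i < 3; i++) {
    y = y * (3 * s * s - r2 * y * y);  // scaled Newton step (no 1/2)
    s = 2 * s * s * s;                 // same update as nwtn_scal above
  }
  assert(std::fabs(y * std::sqrt(r2) - s) < 1e-9 * s);  // y == s/sqrt(r2)
}
// =============================================================================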
1615 
1616  template <class Vec = ValueType, Vec (*RSQRT_INTRIN)(Vec) = rsqrt_intrin0<Vec>> static void sl_vel_m2x_uKernel(const Matrix<ValueType>& src_coord, const Matrix<ValueType>& src_norml, const Matrix<ValueType>& src_value, const Matrix<ValueType>& trg_coord, Matrix<ValueType>& trg_value) {
1617  #define SRC_BLK 500
1618  size_t VecLen = sizeof(Vec) / sizeof(ValueType);
1619 
1621  size_t NWTN_ITER = 0;
1622  if (RSQRT_INTRIN == (Vec (*)(Vec))rsqrt_intrin0<Vec, ValueType>) NWTN_ITER = 0;
1623  if (RSQRT_INTRIN == (Vec (*)(Vec))rsqrt_intrin1<Vec, ValueType>) NWTN_ITER = 1;
1624  if (RSQRT_INTRIN == (Vec (*)(Vec))rsqrt_intrin2<Vec, ValueType>) NWTN_ITER = 2;
1625  if (RSQRT_INTRIN == (Vec (*)(Vec))rsqrt_intrin3<Vec, ValueType>) NWTN_ITER = 3;
1626 
1627  ValueType nwtn_scal = 1; // scaling factor for newton iterations
1628  for (int i = 0; i < NWTN_ITER; i++) {
1629  nwtn_scal = 2 * nwtn_scal * nwtn_scal * nwtn_scal;
1630  }
1631  const ValueType OOEP = 1.0 / (8 * nwtn_scal * const_pi<ValueType>());
1632  Vec inv_nwtn_scal2 = set_intrin<Vec, ValueType>(1.0 / (nwtn_scal * nwtn_scal));
1633 
1634  size_t src_cnt_ = src_coord.Dim(1);
1635  size_t trg_cnt_ = trg_coord.Dim(1);
1636  for (size_t sblk = 0; sblk < src_cnt_; sblk += SRC_BLK) {
1637  size_t src_cnt = src_cnt_ - sblk;
1638  if (src_cnt > SRC_BLK) src_cnt = SRC_BLK;
1639  for (size_t t = 0; t < trg_cnt_; t += VecLen) {
1640  Vec tx = load_intrin<Vec>(&trg_coord[0][t]);
1641  Vec ty = load_intrin<Vec>(&trg_coord[1][t]);
1642  Vec tz = load_intrin<Vec>(&trg_coord[2][t]);
1643 
1644  Vec tvx = zero_intrin<Vec>();
1645  Vec tvy = zero_intrin<Vec>();
1646  Vec tvz = zero_intrin<Vec>();
1647  for (size_t s = sblk; s < sblk + src_cnt; s++) {
1648  Vec dx = sub_intrin(tx, bcast_intrin<Vec>(&src_coord[0][s]));
1649  Vec dy = sub_intrin(ty, bcast_intrin<Vec>(&src_coord[1][s]));
1650  Vec dz = sub_intrin(tz, bcast_intrin<Vec>(&src_coord[2][s]));
1651 
1652  Vec svx = bcast_intrin<Vec>(&src_value[0][s]);
1653  Vec svy = bcast_intrin<Vec>(&src_value[1][s]);
1654  Vec svz = bcast_intrin<Vec>(&src_value[2][s]);
1655  Vec inner_prod = bcast_intrin<Vec>(&src_value[3][s]);
1656 
1657  Vec r2 = mul_intrin(dx, dx);
1658  r2 = add_intrin(r2, mul_intrin(dy, dy));
1659  r2 = add_intrin(r2, mul_intrin(dz, dz));
1660 
1661  Vec rinv = RSQRT_INTRIN(r2);
1662  Vec rinv2 = mul_intrin(mul_intrin(rinv, rinv), inv_nwtn_scal2);
1663 
1664  inner_prod = add_intrin(inner_prod, mul_intrin(svx, dx));
1665  inner_prod = add_intrin(inner_prod, mul_intrin(svy, dy));
1666  inner_prod = add_intrin(inner_prod, mul_intrin(svz, dz));
1667  inner_prod = mul_intrin(inner_prod, rinv2);
1668 
1669  tvx = add_intrin(tvx, mul_intrin(rinv, add_intrin(svx, mul_intrin(dx, inner_prod))));
1670  tvy = add_intrin(tvy, mul_intrin(rinv, add_intrin(svy, mul_intrin(dy, inner_prod))));
1671  tvz = add_intrin(tvz, mul_intrin(rinv, add_intrin(svz, mul_intrin(dz, inner_prod))));
1672  }
1673  Vec ooep = set_intrin<Vec, ValueType>(OOEP);
1674 
1675  tvx = add_intrin(mul_intrin(tvx, ooep), load_intrin<Vec>(&trg_value[0][t]));
1676  tvy = add_intrin(mul_intrin(tvy, ooep), load_intrin<Vec>(&trg_value[1][t]));
1677  tvz = add_intrin(mul_intrin(tvz, ooep), load_intrin<Vec>(&trg_value[2][t]));
1678 
1679  store_intrin(&trg_value[0][t], tvx);
1680  store_intrin(&trg_value[1][t], tvy);
1681  store_intrin(&trg_value[2][t], tvz);
1682  }
1683  }
1684 
1685  { // Add FLOPS
1686  #ifndef __MIC__
1687  //Profile::Add_FLOP((long long)trg_cnt_ * (long long)src_cnt_ * (29 + 4 * (NWTN_ITER)));
1688  #endif
1689  }
1690  #undef SRC_BLK
1691  }
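// ===[ Editorial note; not part of the original kernel.hpp listing ]===========
// sl_vel_m2x_uKernel differs from sl_vel_uKernel in two ways: the fourth
// source-density channel src_value[3] seeds inner_prod with a precomputed
// contribution, and r2 is not masked against eps, so coincident source/target
// pairs are not guarded (targets are assumed distinct from the sources).
// =============================================================================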
1692 
1693  template <class Vec = ValueType, Vec (*RSQRT_INTRIN)(Vec) = rsqrt_intrin0<Vec>> static void dl_vel_uKernel(const Matrix<ValueType>& src_coord, const Matrix<ValueType>& src_norml, const Matrix<ValueType>& src_value, const Matrix<ValueType>& trg_coord, Matrix<ValueType>& trg_value) {
1694  #define SRC_BLK 500
1695  static ValueType eps = machine_eps() * 128;
1696  size_t VecLen = sizeof(Vec) / sizeof(ValueType);
1697 
1699  size_t NWTN_ITER = 0;
1700  if (RSQRT_INTRIN == (Vec (*)(Vec))rsqrt_intrin0<Vec, ValueType>) NWTN_ITER = 0;
1701  if (RSQRT_INTRIN == (Vec (*)(Vec))rsqrt_intrin1<Vec, ValueType>) NWTN_ITER = 1;
1702  if (RSQRT_INTRIN == (Vec (*)(Vec))rsqrt_intrin2<Vec, ValueType>) NWTN_ITER = 2;
1703  if (RSQRT_INTRIN == (Vec (*)(Vec))rsqrt_intrin3<Vec, ValueType>) NWTN_ITER = 3;
1704 
1705  ValueType nwtn_scal = 1; // scaling factor for newton iterations
1706  for (int i = 0; i < NWTN_ITER; i++) {
1707  nwtn_scal = 2 * nwtn_scal * nwtn_scal * nwtn_scal;
1708  }
1709  const ValueType SCAL_CONST = 3.0 / (4.0 * nwtn_scal * nwtn_scal * nwtn_scal * nwtn_scal * nwtn_scal * const_pi<ValueType>());
1710 
1711  size_t src_cnt_ = src_coord.Dim(1);
1712  size_t trg_cnt_ = trg_coord.Dim(1);
1713  for (size_t sblk = 0; sblk < src_cnt_; sblk += SRC_BLK) {
1714  size_t src_cnt = src_cnt_ - sblk;
1715  if (src_cnt > SRC_BLK) src_cnt = SRC_BLK;
1716  for (size_t t = 0; t < trg_cnt_; t += VecLen) {
1717  Vec tx = load_intrin<Vec>(&trg_coord[0][t]);
1718  Vec ty = load_intrin<Vec>(&trg_coord[1][t]);
1719  Vec tz = load_intrin<Vec>(&trg_coord[2][t]);
1720 
1721  Vec tvx = zero_intrin<Vec>();
1722  Vec tvy = zero_intrin<Vec>();
1723  Vec tvz = zero_intrin<Vec>();
1724  for (size_t s = sblk; s < sblk + src_cnt; s++) {
1725  Vec dx = sub_intrin(tx, bcast_intrin<Vec>(&src_coord[0][s]));
1726  Vec dy = sub_intrin(ty, bcast_intrin<Vec>(&src_coord[1][s]));
1727  Vec dz = sub_intrin(tz, bcast_intrin<Vec>(&src_coord[2][s]));
1728 
1729  Vec snx = bcast_intrin<Vec>(&src_value[0][s]);
1730  Vec sny = bcast_intrin<Vec>(&src_value[1][s]);
1731  Vec snz = bcast_intrin<Vec>(&src_value[2][s]);
1732 
1733  Vec svx = bcast_intrin<Vec>(&src_norml[0][s]);
1734  Vec svy = bcast_intrin<Vec>(&src_norml[1][s]);
1735  Vec svz = bcast_intrin<Vec>(&src_norml[2][s]);
1736 
1737  Vec r2 = mul_intrin(dx, dx);
1738  r2 = add_intrin(r2, mul_intrin(dy, dy));
1739  r2 = add_intrin(r2, mul_intrin(dz, dz));
1740  r2 = and_intrin(cmplt_intrin(set_intrin<Vec, ValueType>(eps), r2), r2);
1741 
1742  Vec rinv = RSQRT_INTRIN(r2);
1743  Vec rinv2 = mul_intrin(rinv, rinv);
1744  Vec rinv5 = mul_intrin(mul_intrin(rinv2, rinv2), rinv);
1745 
1746  Vec r_dot_n = mul_intrin(snx, dx);
1747  r_dot_n = add_intrin(r_dot_n, mul_intrin(sny, dy));
1748  r_dot_n = add_intrin(r_dot_n, mul_intrin(snz, dz));
1749 
1750  Vec r_dot_f = mul_intrin(svx, dx);
1751  r_dot_f = add_intrin(r_dot_f, mul_intrin(svy, dy));
1752  r_dot_f = add_intrin(r_dot_f, mul_intrin(svz, dz));
1753 
1754  Vec p = mul_intrin(mul_intrin(r_dot_n, r_dot_f), rinv5);
1755  tvx = add_intrin(tvx, mul_intrin(dx, p));
1756  tvy = add_intrin(tvy, mul_intrin(dy, p));
1757  tvz = add_intrin(tvz, mul_intrin(dz, p));
1758  }
1759  Vec scal_const = set_intrin<Vec, ValueType>(SCAL_CONST);
1760 
1761  tvx = add_intrin(mul_intrin(tvx, scal_const), load_intrin<Vec>(&trg_value[0][t]));
1762  tvy = add_intrin(mul_intrin(tvy, scal_const), load_intrin<Vec>(&trg_value[1][t]));
1763  tvz = add_intrin(mul_intrin(tvz, scal_const), load_intrin<Vec>(&trg_value[2][t]));
1764 
1765  store_intrin(&trg_value[0][t], tvx);
1766  store_intrin(&trg_value[1][t], tvy);
1767  store_intrin(&trg_value[2][t], tvz);
1768  }
1769  }
1770 
1771  { // Add FLOPS
1772  #ifndef __MIC__
1773  //Profile::Add_FLOP((long long)trg_cnt_ * (long long)src_cnt_ * (31 + 4 * (NWTN_ITER)));
1774  #endif
1775  }
1776  #undef SRC_BLK
1777  }
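// ===[ Editorial note; not part of the original kernel.hpp listing ]===========
// In dl_vel_uKernel the loads are swapped relative to their names (snx..snz
// come from src_value, svx..svz from src_norml); the result is unchanged
// because the summand (r.n)(r.f) r / r^5 is symmetric in n and f. SCAL_CONST
// carries five factors of 1/nwtn_scal because rinv ~= nwtn_scal/r, so the
// rinv5 factor in p is off by exactly nwtn_scal^5.
// =============================================================================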
1778 
1779  template <class Vec = ValueType, Vec (*RSQRT_INTRIN)(Vec) = rsqrt_intrin0<Vec>> static void sl_press_uKernel(const Matrix<ValueType>& src_coord, const Matrix<ValueType>& src_norml, const Matrix<ValueType>& src_value, const Matrix<ValueType>& trg_coord, Matrix<ValueType>& trg_value) {
1780  #define SRC_BLK 500
1781  static ValueType eps = machine_eps() * 128;
1782  size_t VecLen = sizeof(Vec) / sizeof(ValueType);
1783 
1785  size_t NWTN_ITER = 0;
1786  if (RSQRT_INTRIN == (Vec (*)(Vec))rsqrt_intrin0<Vec, ValueType>) NWTN_ITER = 0;
1787  if (RSQRT_INTRIN == (Vec (*)(Vec))rsqrt_intrin1<Vec, ValueType>) NWTN_ITER = 1;
1788  if (RSQRT_INTRIN == (Vec (*)(Vec))rsqrt_intrin2<Vec, ValueType>) NWTN_ITER = 2;
1789  if (RSQRT_INTRIN == (Vec (*)(Vec))rsqrt_intrin3<Vec, ValueType>) NWTN_ITER = 3;
1790 
1791  ValueType nwtn_scal = 1; // scaling factor for newton iterations
1792  for (int i = 0; i < NWTN_ITER; i++) {
1793  nwtn_scal = 2 * nwtn_scal * nwtn_scal * nwtn_scal;
1794  }
1795  const ValueType OOEP = 1.0 / (4 * nwtn_scal * nwtn_scal * nwtn_scal * const_pi<ValueType>());
1796 
1797  size_t src_cnt_ = src_coord.Dim(1);
1798  size_t trg_cnt_ = trg_coord.Dim(1);
1799  for (size_t sblk = 0; sblk < src_cnt_; sblk += SRC_BLK) {
1800  size_t src_cnt = src_cnt_ - sblk;
1801  if (src_cnt > SRC_BLK) src_cnt = SRC_BLK;
1802  for (size_t t = 0; t < trg_cnt_; t += VecLen) {
1803  Vec tx = load_intrin<Vec>(&trg_coord[0][t]);
1804  Vec ty = load_intrin<Vec>(&trg_coord[1][t]);
1805  Vec tz = load_intrin<Vec>(&trg_coord[2][t]);
1806 
1807  Vec tv = zero_intrin<Vec>();
1808  for (size_t s = sblk; s < sblk + src_cnt; s++) {
1809  Vec dx = sub_intrin(tx, bcast_intrin<Vec>(&src_coord[0][s]));
1810  Vec dy = sub_intrin(ty, bcast_intrin<Vec>(&src_coord[1][s]));
1811  Vec dz = sub_intrin(tz, bcast_intrin<Vec>(&src_coord[2][s]));
1812 
1813  Vec svx = bcast_intrin<Vec>(&src_value[0][s]);
1814  Vec svy = bcast_intrin<Vec>(&src_value[1][s]);
1815  Vec svz = bcast_intrin<Vec>(&src_value[2][s]);
1816 
1817  Vec r2 = mul_intrin(dx, dx);
1818  r2 = add_intrin(r2, mul_intrin(dy, dy));
1819  r2 = add_intrin(r2, mul_intrin(dz, dz));
1820  r2 = and_intrin(cmplt_intrin(set_intrin<Vec, ValueType>(eps), r2), r2);
1821 
1822  Vec rinv = RSQRT_INTRIN(r2);
1823  Vec rinv3 = mul_intrin(mul_intrin(rinv, rinv), rinv);
1824 
1825  Vec inner_prod = mul_intrin(svx, dx);
1826  inner_prod = add_intrin(inner_prod, mul_intrin(svy, dy));
1827  inner_prod = add_intrin(inner_prod, mul_intrin(svz, dz));
1828  tv = add_intrin(tv, mul_intrin(inner_prod, rinv3));
1829  }
1830  Vec ooep = set_intrin<Vec, ValueType>(OOEP);
1831 
1832  tv = add_intrin(mul_intrin(tv, ooep), load_intrin<Vec>(&trg_value[0][t]));
1833  store_intrin(&trg_value[0][t], tv);
1834  }
1835  }
1836 
1837  { // Add FLOPS
1838  #ifndef __MIC__
1839  //Profile::Add_FLOP((long long)trg_cnt_ * (long long)src_cnt_ * (29 + 4 * (NWTN_ITER)));
1840  #endif
1841  }
1842  #undef SRC_BLK
1843  }
1844 
1845  template <class Vec = ValueType, Vec (*RSQRT_INTRIN)(Vec) = rsqrt_intrin0<Vec>> static void sl_press_m2x_uKernel(const Matrix<ValueType>& src_coord, const Matrix<ValueType>& src_norml, const Matrix<ValueType>& src_value, const Matrix<ValueType>& trg_coord, Matrix<ValueType>& trg_value) {
1846  #define SRC_BLK 500
1847  size_t VecLen = sizeof(Vec) / sizeof(ValueType);
1848 
1850  size_t NWTN_ITER = 0;
1851  if (RSQRT_INTRIN == (Vec (*)(Vec))rsqrt_intrin0<Vec, ValueType>) NWTN_ITER = 0;
1852  if (RSQRT_INTRIN == (Vec (*)(Vec))rsqrt_intrin1<Vec, ValueType>) NWTN_ITER = 1;
1853  if (RSQRT_INTRIN == (Vec (*)(Vec))rsqrt_intrin2<Vec, ValueType>) NWTN_ITER = 2;
1854  if (RSQRT_INTRIN == (Vec (*)(Vec))rsqrt_intrin3<Vec, ValueType>) NWTN_ITER = 3;
1855 
1856  ValueType nwtn_scal = 1; // scaling factor for newton iterations
1857  for (int i = 0; i < NWTN_ITER; i++) {
1858  nwtn_scal = 2 * nwtn_scal * nwtn_scal * nwtn_scal;
1859  }
1860  const ValueType OOEP = 1.0 / (4 * nwtn_scal * nwtn_scal * nwtn_scal * const_pi<ValueType>());
1861 
1862  size_t src_cnt_ = src_coord.Dim(1);
1863  size_t trg_cnt_ = trg_coord.Dim(1);
1864  for (size_t sblk = 0; sblk < src_cnt_; sblk += SRC_BLK) {
1865  size_t src_cnt = src_cnt_ - sblk;
1866  if (src_cnt > SRC_BLK) src_cnt = SRC_BLK;
1867  for (size_t t = 0; t < trg_cnt_; t += VecLen) {
1868  Vec tx = load_intrin<Vec>(&trg_coord[0][t]);
1869  Vec ty = load_intrin<Vec>(&trg_coord[1][t]);
1870  Vec tz = load_intrin<Vec>(&trg_coord[2][t]);
1871 
1872  Vec tv = zero_intrin<Vec>();
1873  for (size_t s = sblk; s < sblk + src_cnt; s++) {
1874  Vec dx = sub_intrin(tx, bcast_intrin<Vec>(&src_coord[0][s]));
1875  Vec dy = sub_intrin(ty, bcast_intrin<Vec>(&src_coord[1][s]));
1876  Vec dz = sub_intrin(tz, bcast_intrin<Vec>(&src_coord[2][s]));
1877 
1878  Vec svx = bcast_intrin<Vec>(&src_value[0][s]);
1879  Vec svy = bcast_intrin<Vec>(&src_value[1][s]);
1880  Vec svz = bcast_intrin<Vec>(&src_value[2][s]);
1881 
1882  Vec r2 = mul_intrin(dx, dx);
1883  r2 = add_intrin(r2, mul_intrin(dy, dy));
1884  r2 = add_intrin(r2, mul_intrin(dz, dz));
1885 
1886  Vec rinv = RSQRT_INTRIN(r2);
1887  Vec rinv3 = mul_intrin(mul_intrin(rinv, rinv), rinv);
1888 
1889  Vec inner_prod = mul_intrin(svx, dx);
1890  inner_prod = add_intrin(inner_prod, mul_intrin(svy, dy));
1891  inner_prod = add_intrin(inner_prod, mul_intrin(svz, dz));
1892  tv = add_intrin(tv, mul_intrin(inner_prod, rinv3));
1893  }
1894  Vec ooep = set_intrin<Vec, ValueType>(OOEP);
1895 
1896  tv = add_intrin(mul_intrin(tv, ooep), load_intrin<Vec>(&trg_value[0][t]));
1897  store_intrin(&trg_value[0][t], tv);
1898  }
1899  }
1900 
1901  { // Add FLOPS
1902  #ifndef __MIC__
1903  //Profile::Add_FLOP((long long)trg_cnt_ * (long long)src_cnt_ * (29 + 4 * (NWTN_ITER)));
1904  #endif
1905  }
1906  #undef SRC_BLK
1907  }
1908 
1909  private:
1910  static ValueType machine_eps() {
1911  ValueType eps = 1;
1912  while (eps * (ValueType)0.5 + (ValueType)1.0 > 1.0) eps *= 0.5;
1913  return eps;
1914  }
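// ===[ Editorial note; not part of the original kernel.hpp listing ]===========
// machine_eps() halves eps until 1 + eps/2 rounds down to 1; under IEEE
// round-to-nearest this yields exactly the machine epsilon of ValueType.
// A standalone check (hypothetical harness):
#include <limits>
#include <cassert>

inline void check_machine_eps() {
  double eps = 1;
  while (eps * 0.5 + 1.0 > 1.0) eps *= 0.5;
  assert(eps == std::numeric_limits<double>::epsilon());
}
// =============================================================================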
1915 
1916  template <Integer newton_iter = 0> static void sl_vel(const Vector<ValueType>& r_src, const Vector<ValueType>& n_src, const Vector<ValueType>& v_src, const Vector<ValueType>& r_trg, Vector<ValueType>& v_trg, const void* ctx) {
1917  #define PVFMM_KER_NWTN(nwtn) \
1918  if (newton_iter == nwtn) KernelFnWrapper::kernel_wrapper<ValueType, Real, Vec, DIM, 3, 3, Stokes3D<Real>::template sl_vel_uKernel<Vec, rsqrt_intrin##nwtn<Vec, Real>>>(r_src, n_src, v_src, r_trg, v_trg)
1919  #define PVFMM_KERNEL_MACRO \
1920  PVFMM_KER_NWTN(0); \
1921  PVFMM_KER_NWTN(1); \
1922  PVFMM_KER_NWTN(2); \
1923  PVFMM_KER_NWTN(3);
1924  if (mem::TypeTraits<ValueType>::ID() == mem::TypeTraits<float>::ID()) { // restored; exact condition assumed
1925  typedef float Real;
1926  #if defined __MIC__
1927  #define Vec Real
1928  #elif defined __AVX__
1929  #define Vec __m256
1930  #elif defined __SSE3__
1931  #define Vec __m128
1932  #else
1933  #define Vec Real
1934  #endif
1935  PVFMM_KERNEL_MACRO;
1936  #undef Vec
1937  } else if (mem::TypeTraits<ValueType>::ID() == mem::TypeTraits<double>::ID()) { // restored; exact condition assumed
1938  typedef double Real;
1939  #if defined __MIC__
1940  #define Vec Real
1941  #elif defined __AVX__
1942  #define Vec __m256d
1943  #elif defined __SSE3__
1944  #define Vec __m128d
1945  #else
1946  #define Vec Real
1947  #endif
1948  PVFMM_KERNEL_MACRO;
1949  #undef Vec
1950  } else {
1951  typedef ValueType Real;
1952  #define Vec Real
1953  PVFMM_KERNEL_MACRO;
1954  #undef Vec
1955  }
1956  #undef PVFMM_KER_NWTN
1957  }
1958 
1959  template <Integer newton_iter = 0> static void sl_vel_m2x(const Vector<ValueType>& r_src, const Vector<ValueType>& n_src, const Vector<ValueType>& v_src, const Vector<ValueType>& r_trg, Vector<ValueType>& v_trg, const void* ctx) {
1960  #define PVFMM_KER_NWTN(nwtn) \
1961  if (newton_iter == nwtn) KernelFnWrapper::kernel_wrapper<ValueType, Real, Vec, DIM, 4, 3, Stokes3D<Real>::template sl_vel_m2x_uKernel<Vec, rsqrt_intrin##nwtn<Vec, Real>>>(r_src, n_src, v_src, r_trg, v_trg)
1962  #define PVFMM_KERNEL_MACRO \
1963  PVFMM_KER_NWTN(0); \
1964  PVFMM_KER_NWTN(1); \
1965  PVFMM_KER_NWTN(2); \
1966  PVFMM_KER_NWTN(3);
1967  if (mem::TypeTraits<ValueType>::ID() == mem::TypeTraits<float>::ID()) { // restored; exact condition assumed
1968  typedef float Real;
1969  #if defined __MIC__
1970  #define Vec Real
1971  #elif defined __AVX__
1972  #define Vec __m256
1973  #elif defined __SSE3__
1974  #define Vec __m128
1975  #else
1976  #define Vec Real
1977  #endif
1978  PVFMM_KERNEL_MACRO;
1979  #undef Vec
1980  } else if (mem::TypeTraits<ValueType>::ID() == mem::TypeTraits<double>::ID()) { // restored; exact condition assumed
1981  typedef double Real;
1982  #if defined __MIC__
1983  #define Vec Real
1984  #elif defined __AVX__
1985  #define Vec __m256d
1986  #elif defined __SSE3__
1987  #define Vec __m128d
1988  #else
1989  #define Vec Real
1990  #endif
1991  PVFMM_KERNEL_MACRO;
1992  #undef Vec
1993  } else {
1994  typedef ValueType Real;
1995  #define Vec Real
1996  PVFMM_KERNEL_MACRO;
1997  #undef Vec
1998  }
1999  #undef PVFMM_KER_NWTN
2000  #undef PVFMM_KERNEL_MACRO
2001  }
2002 
2003  template <Integer newton_iter = 0> static void dl_vel(const Vector<ValueType>& r_src, const Vector<ValueType>& n_src, const Vector<ValueType>& v_src, const Vector<ValueType>& r_trg, Vector<ValueType>& v_trg, const void* ctx) {
2004  #define PVFMM_KER_NWTN(nwtn) \
2005  if (newton_iter == nwtn) KernelFnWrapper::kernel_wrapper<ValueType, Real, Vec, DIM, 3, 3, Stokes3D<Real>::template dl_vel_uKernel<Vec, rsqrt_intrin##nwtn<Vec, Real>>>(r_src, n_src, v_src, r_trg, v_trg)
2006  #define PVFMM_KERNEL_MACRO \
2007  PVFMM_KER_NWTN(0); \
2008  PVFMM_KER_NWTN(1); \
2009  PVFMM_KER_NWTN(2); \
2010  PVFMM_KER_NWTN(3);
2011  if (mem::TypeTraits<ValueType>::ID() == mem::TypeTraits<float>::ID()) { // restored; exact condition assumed
2012  typedef float Real;
2013  #if defined __MIC__
2014  #define Vec Real
2015  #elif defined __AVX__
2016  #define Vec __m256
2017  #elif defined __SSE3__
2018  #define Vec __m128
2019  #else
2020  #define Vec Real
2021  #endif
2022  PVFMM_KERNEL_MACRO;
2023  #undef Vec
2024  } else if (mem::TypeTraits<ValueType>::ID() == mem::TypeTraits<double>::ID()) { // restored; exact condition assumed
2025  typedef double Real;
2026  #if defined __MIC__
2027  #define Vec Real
2028  #elif defined __AVX__
2029  #define Vec __m256d
2030  #elif defined __SSE3__
2031  #define Vec __m128d
2032  #else
2033  #define Vec Real
2034  #endif
2035  PVFMM_KERNEL_MACRO;
2036  #undef Vec
2037  } else {
2038  typedef ValueType Real;
2039  #define Vec Real
2040  PVFMM_KERNEL_MACRO;
2041  #undef Vec
2042  }
2043  #undef PVFMM_KER_NWTN
2044  #undef PVFMM_KERNEL_MACRO
2045  }
2046 
2047  template <Integer newton_iter = 0> static void sl_press(const Vector<ValueType>& r_src, const Vector<ValueType>& n_src, const Vector<ValueType>& v_src, const Vector<ValueType>& r_trg, Vector<ValueType>& v_trg, const void* ctx) {
2048  #define PVFMM_KER_NWTN(nwtn) \
2049  if (newton_iter == nwtn) KernelFnWrapper::kernel_wrapper<ValueType, Real, Vec, DIM, 3, 1, Stokes3D<Real>::template sl_press_uKernel<Vec, rsqrt_intrin##nwtn<Vec, Real>>>(r_src, n_src, v_src, r_trg, v_trg)
2050  #define PVFMM_KERNEL_MACRO \
2051  PVFMM_KER_NWTN(0); \
2052  PVFMM_KER_NWTN(1); \
2053  PVFMM_KER_NWTN(2); \
2054  PVFMM_KER_NWTN(3);
2055  if (mem::TypeTraits<ValueType>::ID() == mem::TypeTraits<float>::ID()) { // restored; exact condition assumed
2056  typedef float Real;
2057  #if defined __MIC__
2058  #define Vec Real
2059  #elif defined __AVX__
2060  #define Vec __m256
2061  #elif defined __SSE3__
2062  #define Vec __m128
2063  #else
2064  #define Vec Real
2065  #endif
2066  PVFMM_KERNEL_MACRO;
2067  #undef Vec
2068  } else if (mem::TypeTraits<ValueType>::ID() == mem::TypeTraits<double>::ID()) { // restored; exact condition assumed
2069  typedef double Real;
2070  #if defined __MIC__
2071  #define Vec Real
2072  #elif defined __AVX__
2073  #define Vec __m256d
2074  #elif defined __SSE3__
2075  #define Vec __m128d
2076  #else
2077  #define Vec Real
2078  #endif
2079  PVFMM_KERNEL_MACRO;
2080  #undef Vec
2081  } else {
2082  typedef ValueType Real;
2083  #define Vec Real
2084  PVFMM_KERNEL_MACRO;
2085  #undef Vec
2086  }
2087  #undef PVFMM_KER_NWTN
2088  }
2089 
2090  template <Integer newton_iter = 0> static void sl_press_m2x(const Vector<ValueType>& r_src, const Vector<ValueType>& n_src, const Vector<ValueType>& v_src, const Vector<ValueType>& r_trg, Vector<ValueType>& v_trg, const void* ctx) {
2091  #define PVFMM_KER_NWTN(nwtn) \
2092  if (newton_iter == nwtn) KernelFnWrapper::kernel_wrapper<ValueType, Real, Vec, DIM, 4, 1, Stokes3D<Real>::template sl_press_m2x_uKernel<Vec, rsqrt_intrin##nwtn<Vec, Real>>>(r_src, n_src, v_src, r_trg, v_trg)
2093  #define PVFMM_KERNEL_MACRO \
2094  PVFMM_KER_NWTN(0); \
2095  PVFMM_KER_NWTN(1); \
2096  PVFMM_KER_NWTN(2); \
2097  PVFMM_KER_NWTN(3);
2098  if (mem::TypeTraits<ValueType>::ID() == mem::TypeTraits<float>::ID()) { // restored; exact condition assumed
2099  typedef float Real;
2100  #if defined __MIC__
2101  #define Vec Real
2102  #elif defined __AVX__
2103  #define Vec __m256
2104  #elif defined __SSE3__
2105  #define Vec __m128
2106  #else
2107  #define Vec Real
2108  #endif
2109  PVFMM_KERNEL_MACRO;
2110  #undef Vec
2111  } else if (mem::TypeTraits<ValueType>::ID() == mem::TypeTraits<double>::ID()) { // restored; exact condition assumed
2112  typedef double Real;
2113  #if defined __MIC__
2114  #define Vec Real
2115  #elif defined __AVX__
2116  #define Vec __m256d
2117  #elif defined __SSE3__
2118  #define Vec __m128d
2119  #else
2120  #define Vec Real
2121  #endif
2122  PVFMM_KERNEL_MACRO;
2123  #undef Vec
2124  } else {
2125  typedef ValueType Real;
2126  #define Vec Real
2127  PVFMM_KERNEL_MACRO;
2128  #undef Vec
2129  }
2130  #undef PVFMM_KER_NWTN
2131  #undef PVFMM_KERNEL_MACRO
2132  }
2133 
2134  friend class KernelFnWrapper;
2135 };
2136 
2137 template <class ValueType> struct Smoother {
2138  public:
2139  static const Integer DIM = 3;
2140 
2141  inline static const KernelFunction<ValueType, DIM>& ker3x3() {
2142  constexpr Integer k_dim0 = DIM;
2143  constexpr Integer k_dim1 = DIM;
2144  static const ValueType sigma2 = 0.0001;
2145  static const ValueType scal = pvfmm::pow(2.0 * const_pi<ValueType>() * sigma2, -(DIM - 1) * 0.5);
2146  static auto micro_ker = [&](ValueType* v, const ValueType* x, ValueType invr, const ValueType* n, const ValueType* f) {
2147  ValueType r2 = 1.0/(invr*invr);
2148  v[0] += exp(-r2/sigma2*0.5) * f[0] * scal;
2149  v[1] += exp(-r2/sigma2*0.5) * f[1] * scal;
2150  v[2] += exp(-r2/sigma2*0.5) * f[2] * scal;
2151  };
2152  return KernelFromLambda<k_dim0, k_dim1>(__FUNCTION__, micro_ker);
2153  }
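// ===[ Editorial example; not part of the original kernel.hpp listing ]========
// ker3x3() applies an isotropic Gaussian, v_i += scal * exp(-r^2/(2*sigma2)) *
// f_i, independently to each of the three components. Note that a coincident
// source/target pair contributes zero: invr arrives as 0, so r2 = 1/(invr*invr)
// overflows to inf and the exponential underflows. Hypothetical usage:
template <class Real>
void example_smooth(const pvfmm::Vector<Real>& r_src,
                    const pvfmm::Vector<Real>& v_src,
                    const pvfmm::Vector<Real>& r_trg,
                    pvfmm::Vector<Real>& v_trg) {
  pvfmm::Vector<Real> n_src(r_src.Dim());  // unused by the Gaussian, but the
  n_src.SetZero();                         // wrapper still indexes into n_src
  // v_trg must be sized 3 * (number of targets); contributions accumulate.
  pvfmm::Smoother<Real>::ker3x3()(r_src, n_src, v_src, r_trg, v_trg);
}
// =============================================================================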
2154 
2155  private:
2156  static void GenKer(const Vector<ValueType>& r_src, const Vector<ValueType>& n_src, const Vector<ValueType>& v_src, const Vector<ValueType>& r_trg, Vector<ValueType>& v_trg, const void* ctx){
2157  const auto& ker = *(std::function<typename KernelFunction<ValueType, DIM>::KerFn>*)(ctx);
2158  ker(r_src, n_src, v_src, r_trg, v_trg, NULL);
2159  }
2160 
2161  template <Integer k_dim0, Integer k_dim1, class LambdaType> inline static const KernelFunction<ValueType, DIM>& KernelFromLambda(std::string name, LambdaType&& micro_ker) {
2162  static std::function<typename KernelFunction<ValueType, DIM>::KerFn> ker_fn = [&](const Vector<ValueType>& r_src, const Vector<ValueType>& n_src, const Vector<ValueType>& v_src, const Vector<ValueType>& r_trg, Vector<ValueType>& v_trg, const void* ctx){
2163  Long src_cnt = r_src.Dim() / DIM;
2164  Long trg_cnt = r_trg.Dim() / DIM;
2165  for (Long t=0;t<trg_cnt;t++){
2166  ValueType v[k_dim1];
2167  for (Integer i=0;i<k_dim1;i++) {
2168  v[i]=0;
2169  }
2170  for (Long s=0;s<src_cnt;s++){
2171  ValueType dx[DIM], f[k_dim0], r2 = 0, invr = 0;
2172  for(Integer k=0;k<DIM;k++) {
2173  dx[k] = r_trg[t*DIM+k] - r_src[s*DIM+k];
2174  r2 += dx[k] * dx[k];
2175  }
2176  for(Integer k=0;k<k_dim0;k++) {
2177  f[k] = v_src[s*k_dim0+k];
2178  }
2179  if(r2>0) invr=1.0/sqrt(r2);
2180  micro_ker(v, dx, invr, &n_src[s*DIM], f);
2181  }
2182  for (Integer i=0;i<k_dim1;i++) {
2183  v_trg[t*k_dim1+i] += v[i];
2184  }
2185  }
2186  };
2187  static KernelFunction<ValueType, DIM> ker(GenKer, k_dim0, k_dim1, name, &ker_fn);
2188  return ker;
2189  }
2190 };
2191 
2192 } // end namespace
2193 
2194 #endif //_PVFMM_KERNEL_HPP_