#ifndef _PVFMM_KERNEL_HPP_
#define _PVFMM_KERNEL_HPP_

#include <pvfmm/intrin_wrapper.hpp>
#include <pvfmm/mem_mgr.hpp>
#include <pvfmm/matrix.hpp>
#include <pvfmm/vector.hpp>
#include <pvfmm/common.hpp>

KernelFunction(KerFn& ker_, Integer d_src, Integer d_trg, std::string ker_name_, void* ctx_ = NULL, bool verbose = false) : ker(ker_), ctx(ctx_) {
  // Append the precision (bits per ValueType) to the kernel name.
  ker_name = ker_name_ + std::to_string(sizeof(ValueType) * 8);
  ker_dim[0] = d_src;
  ker_dim[1] = d_trg;
  Init(verbose);
}
virtual ~KernelFunction() {}

std::string Name() const { return ker_name; }

Integer Dim(Integer i) const { return ker_dim[i]; }

bool ScaleInvariant() const { return scale_invar; }
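// Usage sketch (illustrative only; the kernel object stands in for any
// KernelFunction built in this header):
//   const KernelFunction<double, 3>& K = ...;
//   K.Name();              // e.g. name + "64" (bits of precision appended)
//   K.Dim(0); K.Dim(1);    // source / target degrees of freedom per point
//   K.ScaleInvariant();    // true if K(s*x) is a diagonal rescaling of K(x)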
// Evaluate the dense kernel matrix M (Nsrc*ker_dim[0] x Ntrg*ker_dim[1]),
// one source row at a time, by applying the kernel to unit densities.
// (Signature and local declarations restored from the call sites below.)
void BuildMatrix(const Vector<ValueType>& r_src, const Vector<ValueType>& n_src, const Vector<ValueType>& r_trg, Matrix<ValueType>& M) {
  Long Nsrc = r_src.Dim() / DIM;
  Long Ntrg = r_trg.Dim() / DIM;
  PVFMM_ASSERT(r_src.Dim() == Nsrc * DIM);
  PVFMM_ASSERT(r_trg.Dim() == Ntrg * DIM);
  if (M.Dim(0) != Nsrc * ker_dim[0] || M.Dim(1) != Ntrg * ker_dim[1]) {
    M.ReInit(Nsrc * ker_dim[0], Ntrg * ker_dim[1]);
  }

  Integer omp_p = omp_get_max_threads();
  #pragma omp parallel for schedule(static)
  for (Integer tid = 0; tid < omp_p; tid++) {
    // Each thread handles a contiguous block of source points.
    Long a = Nsrc * (tid + 0) / omp_p;
    Long b = Nsrc * (tid + 1) / omp_p;
    Vector<ValueType> r_src_, n_src_, v_src_(ker_dim[0]), v_trg;
    for (Long i = a; i < b; i++) {
      r_src_.ReInit(DIM, (Iterator<ValueType>)r_src.Begin() + i * DIM, false);
      n_src_.ReInit(DIM, (Iterator<ValueType>)n_src.Begin() + i * DIM, false);
      for (Integer j = 0; j < ker_dim[0]; j++) {
        // Unit density in component j of source i.
        for (Integer k = 0; k < ker_dim[0]; k++) v_src_[k] = (k == j ? 1 : 0);
        // Row (i * ker_dim[0] + j) of M aliases the output vector; the
        // kernel accumulates, so the row is zeroed first.
        v_trg.ReInit(Ntrg * ker_dim[1], M[i * ker_dim[0] + j], false);
        for (Long k = 0; k < v_trg.Dim(); k++) v_trg[k] = 0;
        (*this)(r_src_, n_src_, v_src_, r_trg, v_trg);
      }
    }
  }
}
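// Entry layout sketch: with d0 = ker_dim[0], d1 = ker_dim[1],
//   M[i*d0 + j][t*d1 + k] = component k of the field at target t due to a
//                           unit j-component density at source i,
// so a full source density sigma (Nsrc*d0 values) maps to target values by
// the row-vector product u = sigma^T * M.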
// Accessors for the precomputed symmetry permutations (signatures elided in
// this excerpt; the names below are placeholders).
const Permutation<ValueType>& SrcPerm(Integer idx) const {
  PVFMM_ASSERT(idx < SymmTransCount);
  return src_perm_vec[idx];
}

const Permutation<ValueType>& TrgPerm(Integer idx) const {
  PVFMM_ASSERT(idx < SymmTransCount);
  return trg_perm_vec[idx];
}
void Init(bool verbose) {
  // Machine epsilon for ValueType, computed by repeated halving.
  ValueType eps = 1;
  while (eps + (ValueType)1.0 > 1.0) eps *= 0.5;
  // Tolerance for the symmetry tests below. N is the number of random
  // sample points; its declaration and those of the sample arrays and the
  // matrices M, M1, M2 are elided in this excerpt.
  ValueType eps_ = N * N * eps;

  // Find a sample scale at which the kernel matrix is numerically
  // significant: place random sources at scale scal and test the sum of
  // the matrix entries against sqrt(eps).
  ValueType abs_sum = 0;
  for (Long i = 0; i < N; i++) {
    ValueType x[DIM], r = 0;
    // Random source position, scaled by scal.
    for (Integer j = 0; j < DIM; j++) {
      x[j] = (drand48() - 0.5);
      r += x[j] * x[j];
    }
    r = pvfmm::sqrt<ValueType>(r);
    for (Integer j = 0; j < DIM; j++) {
      src_coord1[i * DIM + j] = x[j] * scal;
    }
    // Random unit normal.
    r = 0;
    for (Integer j = 0; j < DIM; j++) {
      x[j] = (drand48() - 0.5);
      r += x[j] * x[j];
    }
    r = pvfmm::sqrt<ValueType>(r);
    for (Integer j = 0; j < DIM; j++) {
      src_norml1[i * DIM + j] = x[j] / r;
    }
  }
  BuildMatrix(src_coord1, src_norml1, trg_coord, M);
  for (Long i = 0; i < M1.Dim(0) * M1.Dim(1); i++) {
    abs_sum += pvfmm::fabs<ValueType>(M1[0][i]);
  }
  // The enclosing retry loop (which halves scal and resamples) is elided
  // in this excerpt.
  if (abs_sum > pvfmm::sqrt<ValueType>(eps) || scal < eps) break;
  if (ker_dim[0] * ker_dim[1] > 0) {
    // Compare the kernel matrix M1 (sources at scale scal) with M2
    // (sources at scale scal/2) to estimate the scaling exponent of each
    // of the ker_dim[0] x ker_dim[1] kernel components.
    for (Long i = 0; i < N * DIM; i++) {
      src_coord2[i] = src_coord1[i] * 0.5;
      src_norml2[i] = src_norml1[i];
    }
    BuildMatrix(src_coord2, src_norml2, trg_coord, M);

    ValueType max_val = 0;
    for (Long i = 0; i < ker_dim[0] * ker_dim[1]; i++) {
      ValueType dot11 = 0, dot22 = 0;
      for (Long j = 0; j < N; j++) {
        dot11 += M1[j][i] * M1[j][i];
        dot22 += M2[j][i] * M2[j][i];
      }
      max_val = std::max<ValueType>(max_val, dot11);
      max_val = std::max<ValueType>(max_val, dot22);
    }
    for (Long i = 0; i < ker_dim[0] * ker_dim[1]; i++) {
      ValueType dot11 = 0, dot12 = 0, dot22 = 0;
      for (Long j = 0; j < N; j++) {
        dot11 += M1[j][i] * M1[j][i];
        dot12 += M1[j][i] * M2[j][i];
        dot22 += M2[j][i] * M2[j][i];
      }
      if (dot11 > max_val * eps && dot22 > max_val * eps) {
        // Halving the distance multiplies this component by s; the
        // exponent is log2(s).
        PVFMM_ASSERT(dot12 > 0);
        ValueType s = dot12 / dot11;
        M_scal[0][i] = pvfmm::log<ValueType>(s) / pvfmm::log<ValueType>(2.0);
        ValueType err = pvfmm::sqrt<ValueType>((dot22 / dot11) / (s * s) - 1.0);
        // (error check of err against eps_ elided in this excerpt)
      } else if (dot11 > max_val * eps || dot22 > max_val * eps) {
        // Only one of the two matrices has a significant entry here;
        // handling elided in this excerpt.
      }
    }
    // Solve a least-squares system for per-component exponents:
    // M_scal[i0][i1] = x[i0] + x[ker_dim[0] + i1]. The right-hand side b
    // holds the measured exponents; the final row pins the solution.
    memcopy(b.Begin(), M_scal.Begin(), ker_dim[0] * ker_dim[1]);
    b[ker_dim[0] * ker_dim[1]][0] = 0;
    for (Long i0 = 0; i0 < ker_dim[0]; i0++) {
      for (Long i1 = 0; i1 < ker_dim[1]; i1++) {
        Long j = i0 * ker_dim[1] + i1;
        M[j][i0] = 1;
        M[j][i1 + ker_dim[0]] = 1;
      }
    }
    M[ker_dim[0] * ker_dim[1]][0] = 1;
    // (least-squares solve for x elided in this excerpt)
    for (Long i0 = 0; i0 < ker_dim[0]; i0++) {
      for (Long i1 = 0; i1 < ker_dim[1]; i1++) {
        if (M_scal[i0][i1] >= 0) {
          if (pvfmm::fabs<ValueType>(x[i0][0] + x[ker_dim[0] + i1][0] - M_scal[i0][i1]) > eps_) {
            // Exponents do not decompose as a source/target sum;
            // handling elided in this excerpt.
          }
        }
      }
    }
    // Slot 0 holds the per-component scaling permutation (factor 0.5 per
    // level of refinement); slot 1 holds its inverse.
    for (Long i = 0; i < ker_dim[0]; i++) {
      P_src.scal[i] = pvfmm::pow<ValueType>(0.5, x[i][0]);
    }
    for (Long i = 0; i < ker_dim[1]; i++) {
      P_trg.scal[i] = pvfmm::pow<ValueType>(0.5, x[ker_dim[0] + i][0]);
    }
    src_perm_vec[0] = P_src;
    trg_perm_vec[0] = P_trg;

    for (Long i = 0; i < ker_dim[0]; i++) {
      P_src.scal[i] = pvfmm::pow<ValueType>(0.5, -x[i][0]);
    }
    for (Long i = 0; i < ker_dim[1]; i++) {
      P_trg.scal[i] = pvfmm::pow<ValueType>(0.5, -x[ker_dim[0] + i][0]);
    }
    src_perm_vec[1] = P_src;
    trg_perm_vec[1] = P_trg;
  }
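  // Math note: for a scale-invariant kernel, halving all distances
  // multiplies component (i0, i1) of the kernel matrix by
  // 2^(x[i0] + x[ker_dim[0]+i1]), i.e. K(x/2) = D_src * K(x) * D_trg with
  // diagonal D_src[i] = 0.5^x[i], D_trg[i] = 0.5^x[ker_dim[0]+i] -- exactly
  // the scal factors stored in slots 0 (and inverted in slot 1) above.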
  if (ker_dim[0] * ker_dim[1] > 0) {
    // Build each symmetry transform of the source distribution: axis
    // reflections (p_type in [2, 2+DIM)) followed by coordinate swaps.
    for (Integer p_type = 2; p_type < SymmTransCount; p_type++) {
      if (p_type < 2 + DIM) {
        // Reflection of coordinate p_type - 2.
        for (Long i = 0; i < N; i++) {
          for (Integer j = 0; j < DIM; j++) {
            if (p_type == 2 + j) {
              src_coord2[i * DIM + j] = -src_coord1[i * DIM + j];
              src_norml2[i * DIM + j] = -src_norml1[i * DIM + j];
            } else {
              src_coord2[i * DIM + j] = src_coord1[i * DIM + j];
              src_norml2[i * DIM + j] = src_norml1[i * DIM + j];
            }
          }
        }
      } else {
        // Swap of coordinate axes swap0 and swap1.
        Integer swap0, swap1;
        Integer iter = 2 + DIM;
        for (Integer i = 0; i < DIM; i++) {
          for (Integer j = i + 1; j < DIM; j++) {
            if (iter == p_type) {
              swap0 = i;
              swap1 = j;
            }
            iter++;
          }
        }
        for (Long i = 0; i < N; i++) {
          for (Integer j = 0; j < DIM; j++) {
            if (j == swap0) {
              src_coord2[i * DIM + j] = src_coord1[i * DIM + swap1];
              src_norml2[i * DIM + j] = src_norml1[i * DIM + swap1];
            } else if (j == swap1) {
              src_coord2[i * DIM + j] = src_coord1[i * DIM + swap0];
              src_norml2[i * DIM + j] = src_norml1[i * DIM + swap0];
            } else {
              src_coord2[i * DIM + j] = src_coord1[i * DIM + j];
              src_norml2[i * DIM + j] = src_norml1[i * DIM + j];
            }
          }
        }
      }
      BuildMatrix(src_coord2, src_norml2, trg_coord, M);
      // Gram matrices between the components of M1 (original sources) and
      // M2 (transformed sources), used to match components up to sign.
      for (Long k = 0; k < N; k++) {
        for (Long i = 0; i < ker_dim[0] * ker_dim[1]; i++) {
          for (Long j = 0; j < ker_dim[0] * ker_dim[1]; j++) {
            dot11[i][j] += M1[k][i] * M1[k][j];
            dot12[i][j] += M1[k][i] * M2[k][j];
            dot22[i][j] += M2[k][i] * M2[k][j];
          }
        }
      }
      for (Long i = 0; i < ker_dim[0] * ker_dim[1]; i++) {
        norm1[i] = pvfmm::sqrt<ValueType>(dot11[i][i]);
        norm2[i] = pvfmm::sqrt<ValueType>(dot22[i][i]);
      }
      for (Long i = 0; i < ker_dim[0] * ker_dim[1]; i++) {
        for (Long j = 0; j < ker_dim[0] * ker_dim[1]; j++) {
          dot11[i][j] /= (norm1[i] * norm1[j]);
          dot12[i][j] /= (norm1[i] * norm2[j]);
          dot22[i][j] /= (norm2[i] * norm2[j]);
        }
      }

      // Group components that agree up to sign: equal flags in M11 mark
      // matching components within M1; M22 marks matches between M1 and
      // M2, signed by the direction of the correlation.
      M11.ReInit(ker_dim[0], ker_dim[1]);
      M22.ReInit(ker_dim[0], ker_dim[1]);
      for (Long i = 0; i < ker_dim[0] * ker_dim[1]; i++) {
        if (norm1[i] > eps_ && M11[0][i] == 0) {
          for (Long j = 0; j < ker_dim[0] * ker_dim[1]; j++) {
            if (pvfmm::fabs<ValueType>(norm1[i] - norm1[j]) < eps_ && pvfmm::fabs<ValueType>(pvfmm::fabs<ValueType>(dot11[i][j]) - 1.0) < eps_) {
              M11[0][j] = (dot11[i][j] > 0 ? flag : -flag);
            }
            if (pvfmm::fabs<ValueType>(norm1[i] - norm2[j]) < eps_ && pvfmm::fabs<ValueType>(pvfmm::fabs<ValueType>(dot12[i][j]) - 1.0) < eps_) {
              M22[0][j] = (dot12[i][j] > 0 ? flag : -flag);
            }
          }
          // (increment of flag elided in this excerpt)
        }
      }
      // Candidate source permutations: component i can map to component j
      // only if the sorted absolute rows of M1 and M2 agree.
      for (Long i = 0; i < M1.Dim(0); i++) {
        for (Long j = 0; j < M1.Dim(1); j++) {
          if (M1[i][j] < 0) M1[i][j] = -M1[i][j];
          if (M2[i][j] < 0) M2[i][j] = -M2[i][j];
        }
        std::sort(M1[i], M1[i] + M1.Dim(1));
        std::sort(M2[i], M2[i] + M2.Dim(1));
      }
      P.ReInit(M1.Dim(0), M1.Dim(0));
      for (Long i = 0; i < M1.Dim(0); i++) {
        for (Long j = 0; j < M1.Dim(0); j++) {
          P[i][j] = 1;
          for (Long k = 0; k < M1.Dim(1); k++) {
            if (M1[i][k] != M2[j][k]) {
              P[i][j] = 0;
              break;
            }
          }
        }
      }

      // The same comparison, repeated for the candidate target
      // permutations (the transposed matrices are set up in elided code).
      for (Long i = 0; i < M1.Dim(0); i++) {
        for (Long j = 0; j < M1.Dim(1); j++) {
          if (M1[i][j] < 0) M1[i][j] = -M1[i][j];
          if (M2[i][j] < 0) M2[i][j] = -M2[i][j];
        }
        std::sort(M1[i], M1[i] + M1.Dim(1));
        std::sort(M2[i], M2[i] + M2.Dim(1));
      }
      P.ReInit(M1.Dim(0), M1.Dim(0));
      for (Long i = 0; i < M1.Dim(0); i++) {
        for (Long j = 0; j < M1.Dim(0); j++) {
          P[i][j] = 1;
          for (Long k = 0; k < M1.Dim(1); k++) {
            if (M1[i][k] != M2[j][k]) {
              P[i][j] = 0;
              break;
            }
          }
        }
      }
      // Enumerate all permutations consistent with the 0/1 compatibility
      // matrix P. The bookkeeping between the visible loops is elided in
      // this excerpt: a candidate perm is built, accepted if its image
      // perm_tmp sorts to the identity, and then advanced to the next
      // admissible candidate (last flags termination).
      for (Long i = 0; i < P.Dim(); i++) {
        for (Long j = 0; j < P.Dim(); j++) {
          // (candidate construction elided)
        }
      }
      std::sort(&perm_tmp[0], &perm_tmp[0] + perm_tmp.Dim());
      for (Long i = 0; i < perm_tmp.Dim(); i++) {
        if (perm_tmp[i] != i) break;
        if (i == perm_tmp.Dim() - 1) {
          // (valid permutation recorded; elided)
        }
      }
      for (Long i = 0; i < P.Dim(); i++) {
        for (Long j = perm[i] + 1; j < P.Dim(); j++) {
          // (search for the next admissible image of i elided)
        }
        if (perm[i] > tmp) break;
        for (Long j = 0; j < P.Dim(); j++) {
          // (reset of the tail of perm elided)
        }
        if (i == P.Dim() - 1) last = true;
      }

      // The same enumeration, repeated for the target-side matrix.
      for (Long i = 0; i < P.Dim(); i++) {
        for (Long j = 0; j < P.Dim(); j++) {
          // (candidate construction elided)
        }
      }
      std::sort(&perm_tmp[0], &perm_tmp[0] + perm_tmp.Dim());
      for (Long i = 0; i < perm_tmp.Dim(); i++) {
        if (perm_tmp[i] != i) break;
        if (i == perm_tmp.Dim() - 1) {
          // (valid permutation recorded; elided)
        }
      }
      for (Long i = 0; i < P.Dim(); i++) {
        for (Long j = perm[i] + 1; j < P.Dim(); j++) {
          // (search for the next admissible image of i elided)
        }
        if (perm[i] > tmp) break;
        for (Long j = 0; j < P.Dim(); j++) {
          // (reset of the tail of perm elided)
        }
        if (i == P.Dim() - 1) last = true;
      }
      // Keep only candidate pairs (P1, P2) that map |M2| exactly onto |M1|.
      for (Long i = 0; i < M1.Dim(0); i++) {
        for (Long j = 0; j < M1.Dim(1); j++) {
          if (M1[i][j] < 0) M1[i][j] = -M1[i][j];
          if (M2[i][j] < 0) M2[i][j] = -M2[i][j];
        }
      }
      for (Long i = 0; i < P1vec.Dim(); i++) {
        for (Long j = 0; j < P2vec.Dim(); j++) {
          M = P1vec[i] * M2 * P2vec[j];
          for (Long k = 0; k < M.Dim(0) * M.Dim(1); k++) {
            if (M[0][k] != M1[0][k]) break;
            if (k == M.Dim(0) * M.Dim(1) - 1) {
              P1vec_.PushBack(P1vec[i]);
              P2vec_.PushBack(P2vec[j]);
            }
          }
        }
      }

      // For each surviving candidate, solve a small linear system for the
      // sign pattern: row (i, j) relates the source-side sign for row i to
      // the target-side sign for column j (assembly partially elided).
      for (Long k = 0; k < P1vec.Dim(); k++) {
        M[M1.Dim(0) * M1.Dim(1)][0] = 1.0;
        for (Long i = 0; i < M1.Dim(0); i++) {
          for (Long j = 0; j < M1.Dim(1); j++) {
            Long k = i * M1.Dim(1) + j;
            M[k][i] = M1[i][j];
            M[k][M1.Dim(0) + j] = -M2[i][j];
          }
        }
        // (solve elided; the last column of M holds the solution)
        for (Long i = 0; i < M1.Dim(0); i++) {
          P1_.scal[i] = (M[i][M1.Dim(0) * M1.Dim(1)] > 0 ? 1 : -1);
        }
        for (Long i = 0; i < M1.Dim(1); i++) {
          P2_.scal[i] = (M[M1.Dim(0) + i][M1.Dim(0) * M1.Dim(1)] > 0 ? 1 : -1);
        }
        for (Long i = 0; i < Merr.Dim(0) * Merr.Dim(1); i++) {
          // (residual accumulation for the sign system elided)
        }
        // Compare against the previously accepted permutation pair, then
        // record it (control flow partially elided in this excerpt).
        for (Long i = 0; i < P1.Dim(); i++) {
          if (P1_.perm[i] != P1.perm[i] || P1_.scal[i] != P1.scal[i]) {
            // (mismatch handling elided)
          }
        }
        for (Long i = 0; i < P2.Dim(); i++) {
          if (P2_.perm[i] != P2.perm[i] || P2_.scal[i] != P2.scal[i]) {
            // (mismatch handling elided)
          }
        }
        for (Long i = 0; i < P1.Dim(); i++) {
          P1_.perm[i] = P1.perm[i];
          P1_.scal[i] = P1.scal[i];
        }
        for (Long i = 0; i < P2.Dim(); i++) {
          P2_.perm[i] = P2.perm[i];
          P2_.scal[i] = P2.scal[i];
        }
      }
      assert(P1_.Dim() && P2_.Dim());
      src_perm_vec[p_type] = P1_;
      trg_perm_vec[p_type] = P2_;
    }
  }
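  // Summary of the construction above: for every symmetry transform T of
  // the sources (reflections and axis swaps), the detected pair
  // (P_src, P_trg) satisfies K(T x, T y) = P_src * K(x, y) * P_trg as
  // signed permutations of the kernel components, which lets the solver
  // reuse one precomputed interaction matrix for all symmetry-equivalent
  // directions.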
  for (Integer i = 2; i < SymmTransCount; i++) {
    PVFMM_ASSERT_MSG(src_perm_vec[i].Dim() && trg_perm_vec[i].Dim(), "no-symmetry for: " << ker_name);
  }

  if (verbose) {  // (guard restored from the verbose flag)
    std::cout << "Kernel Name    : " << ker_name << '\n';
    std::cout << "Precision      : " << (double)eps << '\n';
    std::cout << "Scale Invariant: " << (scale_invar ? "yes" : "no") << '\n';
    if (scale_invar && ker_dim[0] * ker_dim[1] > 0) {
      std::cout << "Scaling Matrix :\n";
      Matrix<ValueType> Src(ker_dim[0], 1), Trg(1, ker_dim[1]);
      for (Long i = 0; i < ker_dim[0]; i++) Src[i][0] = 1.0 / src_perm_vec[0].scal[i];
      for (Long i = 0; i < ker_dim[1]; i++) Trg[0][i] = 1.0 / trg_perm_vec[0].scal[i];
      std::cout << Src * Trg;
    }
  }
}

// (from a subsequent member that requires scale invariance; context elided)
PVFMM_ASSERT(scale_invar);
std::string ker_name;
StaticArray<Integer, 2> ker_dim;

static const Integer SymmTransCount = 2 + DIM + DIM * (DIM - 1) / 2;
StaticArray<Permutation<ValueType>, SymmTransCount> src_perm_vec, trg_perm_vec;
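// SymmTransCount counts the stored transforms: slots 0-1 hold the scaling
// and its inverse, slots 2 .. 2+DIM-1 the axis reflections, and the last
// DIM*(DIM-1)/2 slots the pairwise coordinate swaps. For DIM = 3 that is
// 2 + 3 + 3 = 8 permutation pairs per kernel.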
template <class ValueType, class Real, class Vec, Integer DIM, Integer SRC_DIM, Integer TRG_DIM,
          void (*uKernel)(const Matrix<Real>&, const Matrix<Real>&, const Matrix<Real>&, const Matrix<Real>&, Matrix<Real>&)>
static void kernel_wrapper(const Vector<ValueType>& r_src, const Vector<ValueType>& n_src, const Vector<ValueType>& v_src, const Vector<ValueType>& r_trg, Vector<ValueType>& v_trg) {
  Integer VecLen = sizeof(Vec) / sizeof(Real);
  Long src_cnt = r_src.Dim() / DIM;
  Long trg_cnt = r_trg.Dim() / DIM;

#define STACK_BUFF_SIZE 4096
  StaticArray<Real, STACK_BUFF_SIZE + PVFMM_MEM_ALIGN> stack_buff;
  Iterator<Real> buff = NULL;

  Matrix<Real> src_coord, src_norml, src_value, trg_coord, trg_value;
  // Counts rounded up to a multiple of the SIMD vector length.
  Long src_cnt_ = ((src_cnt + VecLen - 1) / VecLen) * VecLen;
  Long trg_cnt_ = ((trg_cnt + VecLen - 1) / VecLen) * VecLen;

  Iterator<Real> buff_ptr = NULL;
  // Total scratch size; fall back to the heap only when the stack buffer
  // is too small.
  Long buff_size = 0;
  if (r_src.Dim()) buff_size += src_cnt_ * DIM;
  if (n_src.Dim()) buff_size += src_cnt_ * DIM;
  if (v_src.Dim()) buff_size += src_cnt_ * SRC_DIM;
  if (r_trg.Dim()) buff_size += trg_cnt_ * DIM;
  if (v_trg.Dim()) buff_size += trg_cnt_ * TRG_DIM;
  if (buff_size > STACK_BUFF_SIZE) {
    buff = aligned_new<Real>(buff_size);
    buff_ptr = buff;
  } else {
    // Align the stack buffer to PVFMM_MEM_ALIGN bytes.
    const uintptr_t ptr = (uintptr_t)&stack_buff[0];
    const uintptr_t ALIGN_MINUS_ONE = PVFMM_MEM_ALIGN - 1;
    const uintptr_t NOT_ALIGN_MINUS_ONE = ~ALIGN_MINUS_ONE;
    const uintptr_t offset = ((ptr + ALIGN_MINUS_ONE) & NOT_ALIGN_MINUS_ONE) - ptr;
    buff_ptr = (Iterator<Real>)((Iterator<char>)stack_buff + offset);
  }
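  // Alignment note: ((ptr + A-1) & ~(A-1)) rounds ptr up to the next
  // multiple of A (A a power of two). E.g. with A = 64 and ptr = 0x1003
  // the offset is 0x1040 - 0x1003 = 61 bytes, landing on 0x1040.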
  // Repack sources and targets from interleaved (AoS) input into padded
  // SoA matrices, zero-filling up to the rounded counts.
  if (r_src.Dim()) {
    src_coord.ReInit(DIM, src_cnt_, buff_ptr, false);
    buff_ptr += DIM * src_cnt_;
    Long i = 0;
    for (; i < src_cnt; i++) {
      for (Long j = 0; j < DIM; j++) {
        src_coord[j][i] = r_src[i * DIM + j];
      }
    }
    for (; i < src_cnt_; i++) {
      for (Long j = 0; j < DIM; j++) {
        src_coord[j][i] = 0;
      }
    }
  }
  if (n_src.Dim()) {
    src_norml.ReInit(DIM, src_cnt_, buff_ptr, false);
    buff_ptr += DIM * src_cnt_;
    Long i = 0;
    for (; i < src_cnt; i++) {
      for (Long j = 0; j < DIM; j++) {
        src_norml[j][i] = n_src[i * DIM + j];
      }
    }
    for (; i < src_cnt_; i++) {
      for (Long j = 0; j < DIM; j++) {
        src_norml[j][i] = 0;
      }
    }
  }
  if (v_src.Dim()) {
    src_value.ReInit(SRC_DIM, src_cnt_, buff_ptr, false);
    buff_ptr += SRC_DIM * src_cnt_;
    Long i = 0;
    for (; i < src_cnt; i++) {
      for (Long j = 0; j < SRC_DIM; j++) {
        src_value[j][i] = v_src[i * SRC_DIM + j];
      }
    }
    for (; i < src_cnt_; i++) {
      for (Long j = 0; j < SRC_DIM; j++) {
        src_value[j][i] = 0;
      }
    }
  }
  if (r_trg.Dim()) {
    trg_coord.ReInit(DIM, trg_cnt_, buff_ptr, false);
    buff_ptr += DIM * trg_cnt_;
    Long i = 0;
    for (; i < trg_cnt; i++) {
      for (Long j = 0; j < DIM; j++) {
        trg_coord[j][i] = r_trg[i * DIM + j];
      }
    }
    for (; i < trg_cnt_; i++) {
      for (Long j = 0; j < DIM; j++) {
        trg_coord[j][i] = 0;
      }
    }
  }
  if (v_trg.Dim()) {
    // Accumulator starts at zero; results are added back to v_trg below.
    trg_value.ReInit(TRG_DIM, trg_cnt_, buff_ptr, false);
    buff_ptr += TRG_DIM * trg_cnt_;
    Long i = 0;
    for (; i < trg_cnt_; i++) {
      for (Long j = 0; j < TRG_DIM; j++) {
        trg_value[j][i] = 0;
      }
    }
  }

  uKernel(src_coord, src_norml, src_value, trg_coord, trg_value);

  for (Long i = 0; i < trg_cnt; i++) {
    for (Long j = 0; j < TRG_DIM; j++) {
      v_trg[i * TRG_DIM + j] += trg_value[j][i];
    }
  }
  if (buff) aligned_delete<Real>(buff);
}
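// Design note: uKernel sees column-major (SoA) blocks whose lengths are
// multiples of VecLen, so its inner loops can use full-width SIMD loads
// with no scalar remainder; the zero padding contributes nothing to the
// accumulated sums.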
static const Integer DIM = 3;

// Signature restored from the uKernel function-pointer type used in
// kernel_wrapper above; SRC_BLK (the source blocking size) is defined
// elsewhere in this header.
template <class Vec, Vec (*RSQRT_INTRIN)(Vec)>
static void sl_poten_uKernel(const Matrix<ValueType>& src_coord, const Matrix<ValueType>& src_norml, const Matrix<ValueType>& src_value, const Matrix<ValueType>& trg_coord, Matrix<ValueType>& trg_value) {
  Integer VecLen = sizeof(Vec) / sizeof(ValueType);

  // Number of Newton refinement steps baked into the chosen rsqrt variant.
  Integer NWTN_ITER = 0;
  if (RSQRT_INTRIN == (Vec (*)(Vec))rsqrt_intrin0<Vec, ValueType>) NWTN_ITER = 0;
  if (RSQRT_INTRIN == (Vec (*)(Vec))rsqrt_intrin1<Vec, ValueType>) NWTN_ITER = 1;
  if (RSQRT_INTRIN == (Vec (*)(Vec))rsqrt_intrin2<Vec, ValueType>) NWTN_ITER = 2;
  if (RSQRT_INTRIN == (Vec (*)(Vec))rsqrt_intrin3<Vec, ValueType>) NWTN_ITER = 3;

  ValueType nwtn_scal = 1;
  for (Integer i = 0; i < NWTN_ITER; i++) {
    nwtn_scal = 2 * nwtn_scal * nwtn_scal * nwtn_scal;
  }
  const ValueType OOFP = 1.0 / (4 * nwtn_scal * const_pi<ValueType>());
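  // Scaling note: the scaled Newton steps used by rsqrt_intrin{1,2,3}
  // avoid per-step normalization, so each step maps the output scale s to
  // 2*s^3 (the nwtn_scal recurrence above), i.e. the returned value is
  // 2^((3^k - 1)/2) / sqrt(r2) after k steps. Folding the 1/nwtn_scal
  // correction into OOFP = 1/(4*pi*nwtn_scal) keeps the inner loop free of
  // extra multiplies.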
  Long src_cnt_ = src_coord.Dim(1);
  Long trg_cnt_ = trg_coord.Dim(1);
  for (Long sblk = 0; sblk < src_cnt_; sblk += SRC_BLK) {
    Long src_cnt = src_cnt_ - sblk;
    if (src_cnt > SRC_BLK) src_cnt = SRC_BLK;
    for (Long t = 0; t < trg_cnt_; t += VecLen) {
      // One SIMD vector of targets against one block of sources.
      Vec tx = load_intrin<Vec>(&trg_coord[0][t]);
      Vec ty = load_intrin<Vec>(&trg_coord[1][t]);
      Vec tz = load_intrin<Vec>(&trg_coord[2][t]);
      Vec tv = zero_intrin<Vec>();
      for (Long s = sblk; s < sblk + src_cnt; s++) {
        Vec dx = sub_intrin(tx, bcast_intrin<Vec>(&src_coord[0][s]));
        Vec dy = sub_intrin(ty, bcast_intrin<Vec>(&src_coord[1][s]));
        Vec dz = sub_intrin(tz, bcast_intrin<Vec>(&src_coord[2][s]));
        Vec sv = bcast_intrin<Vec>(&src_value[0][s]);

        Vec r2 = mul_intrin(dx, dx);
        r2 = add_intrin(r2, mul_intrin(dy, dy));
        r2 = add_intrin(r2, mul_intrin(dz, dz));

        Vec rinv = RSQRT_INTRIN(r2);
        tv = add_intrin(tv, mul_intrin(rinv, sv));
      }
      Vec oofp = set_intrin<Vec, ValueType>(OOFP);
      tv = add_intrin(mul_intrin(tv, oofp), load_intrin<Vec>(&trg_value[0][t]));
      store_intrin(&trg_value[0][t], tv);
    }
  }
}
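// Loop structure: sources are visited in SRC_BLK-sized blocks (for cache
// reuse of the target accumulator) and each source value is broadcast
// against VecLen targets, so the inner body computes VecLen interactions
// per iteration with a single rsqrt.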
#define PVFMM_KER_NWTN(nwtn)                                                           \
  if (newton_iter == nwtn)                                                             \
  KernelFnWrapper::kernel_wrapper<ValueType, Real, Vec, DIM, 1, 1,                     \
      Laplace3D<Real>::template sl_poten_uKernel<Vec, rsqrt_intrin##nwtn<Vec, Real>>>( \
      r_src, n_src, v_src, r_trg, v_trg)
// PVFMM_KERNEL_MACRO dispatches on the runtime Newton-iteration count
// (body restored from the PVFMM_KER_NWTN pattern above):
#define PVFMM_KERNEL_MACRO \
  PVFMM_KER_NWTN(0);       \
  PVFMM_KER_NWTN(1);       \
  PVFMM_KER_NWTN(2);       \
  PVFMM_KER_NWTN(3)
// The surrounding dispatch (mostly elided in this excerpt) selects the
// widest available SIMD type per precision:
//   #if defined __AVX512F__ ... #elif defined __AVX__ ... #elif defined __SSE3__ ... #endif
// with a scalar fallback branch containing
typedef ValueType Real;
// and it ends with
#undef PVFMM_KER_NWTN

// Scalar reference implementation of the Laplace double-layer potential
// (enclosing function signature elided in this excerpt).
const ValueType OOFP = 1.0 / (4 * const_pi<ValueType>());
Long Nsrc = r_src.Dim() / DIM;
Long Ntrg = r_trg.Dim() / DIM;
PVFMM_ASSERT(r_src.Dim() == Nsrc * DIM);
PVFMM_ASSERT(n_src.Dim() == Nsrc * DIM);
PVFMM_ASSERT(v_src.Dim() == Nsrc * 1);
PVFMM_ASSERT(r_trg.Dim() == Ntrg * DIM);
PVFMM_ASSERT(v_trg.Dim() == Ntrg * 1);
for (Long i = 0; i < Ntrg; i++) {
  ValueType trg_val = 0.0;
  for (Long j = 0; j < Nsrc; j++) {
    ValueType dx[DIM];
    for (Integer k = 0; k < DIM; k++) {
      dx[k] = r_trg[i * DIM + k] - r_src[j * DIM + k];
    }
    ValueType n_dot_r = 0;
    for (Integer k = 0; k < DIM; k++) {
      n_dot_r += n_src[j * DIM + k] * dx[k];
    }
    ValueType r2 = 0;
    for (Integer k = 0; k < DIM; k++) {
      r2 += dx[k] * dx[k];
    }
    // The r2 == 0 guard drops the singular self-interaction term.
    ValueType r2inv = (r2 ? 1.0 / r2 : 0.0);
    ValueType rinv = sqrt(r2inv);
    trg_val += v_src[j] * n_dot_r * r2inv * rinv;
  }
  v_trg[i] += trg_val * OOFP;
}
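// This accumulates the double-layer potential
//   u(x_i) += sum_j sigma_j * (n_j . (x_i - y_j)) / (4*pi*|x_i - y_j|^3),
// matching the n_dot_r * r2inv * rinv product above.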
Long Nsrc = r_src.Dim() / DIM;
Long Ntrg = r_trg.Dim() / DIM;
PVFMM_ASSERT(r_src.Dim() == Nsrc * DIM);
PVFMM_ASSERT(v_src.Dim() == Nsrc * 1);
PVFMM_ASSERT(r_trg.Dim() == Ntrg * DIM);
PVFMM_ASSERT(v_trg.Dim() == Ntrg * DIM);
for (Long i = 0; i < Ntrg; i++) {
  ValueType trg_val[DIM];
  for (Integer k = 0; k < DIM; k++) {
    trg_val[k] = 0;
  }
  for (Long j = 0; j < Nsrc; j++) {
    ValueType dx[DIM];
    for (Integer k = 0; k < DIM; k++) {
      dx[k] = r_trg[i * DIM + k] - r_src[j * DIM + k];
    }
    ValueType r2 = 0;
    for (Integer k = 0; k < DIM; k++) {
      r2 += dx[k] * dx[k];
    }
    ValueType r2inv = (r2 ? 1.0 / r2 : 0.0);
    ValueType rinv = sqrt(r2inv);
    ValueType v_r3inv = v_src[j] * r2inv * rinv;
    for (Integer k = 0; k < DIM; k++) {
      trg_val[k] += dx[k] * v_r3inv;
    }
  }
  for (Integer k = 0; k < DIM; k++) {
    v_trg[i * DIM + k] += trg_val[k];
  }
}
Long Nsrc = r_src.Dim() / DIM;
Long Ntrg = r_trg.Dim() / DIM;
PVFMM_ASSERT(r_src.Dim() == Nsrc * DIM);
PVFMM_ASSERT(n_src.Dim() == Nsrc * DIM);
PVFMM_ASSERT(v_src.Dim() == Nsrc * 1);
PVFMM_ASSERT(r_trg.Dim() == Ntrg * DIM);
PVFMM_ASSERT(v_trg.Dim() == Ntrg * DIM);
for (Long i = 0; i < Ntrg; i++) {
  ValueType trg_val[DIM];
  for (Integer k = 0; k < DIM; k++) {
    trg_val[k] = 0;
  }
  for (Long j = 0; j < Nsrc; j++) {
    ValueType dx[DIM];
    for (Integer k = 0; k < DIM; k++) {
      dx[k] = r_trg[i * DIM + k] - r_src[j * DIM + k];
    }
    ValueType n_dot_r = 0;
    for (Integer k = 0; k < DIM; k++) {
      n_dot_r += n_src[j * DIM + k] * dx[k];
    }
    ValueType r2 = 0;
    for (Integer k = 0; k < DIM; k++) {
      r2 += dx[k] * dx[k];
    }
    ValueType r2inv = (r2 ? 1.0 / r2 : 0.0);
    ValueType rinv = sqrt(r2inv);
    ValueType v_nr_r5inv = v_src[j] * r2inv * r2inv * rinv * n_dot_r;
    for (Integer k = 0; k < DIM; k++) {
      trg_val[k] += dx[k] * v_nr_r5inv;
    }
  }
  for (Integer k = 0; k < DIM; k++) {
    v_trg[i * DIM + k] += trg_val[k];
  }
}
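// All three scalar loops above share the same shape: O(Nsrc * Ntrg)
// direct summation with the (r2 ? 1/r2 : 0) guard excluding the singular
// self-interaction. They serve as reference implementations for the
// vectorized uKernels selected by the dispatch macros.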
static const Integer DIM = 3;

// Each of the following blocks is the body of a separate static member
// function returning a kernel (the function heads are elided in this
// excerpt); the closing "};" of each lambda is restored.
constexpr Integer k_dim0 = DIM;
constexpr Integer k_dim1 = 1;
static const ValueType scal = 1.0/(8.0*const_pi<ValueType>());
static auto micro_ker = [&](ValueType* v, const ValueType* x, ValueType invr, const ValueType* n, const ValueType* f) {
  ValueType fdotr = 0;
  ValueType invr2 = invr*invr;
  ValueType invr3 = invr2*invr;
  ValueType invr5 = invr2*invr3;
  ValueType invr7 = invr2*invr5;
  for(Integer k=0;k<DIM;k++) fdotr += x[k] * f[k];

  v[0] += (2*invr3*fdotr) * scal;
};
return KernelFromLambda<k_dim0, k_dim1>(__FUNCTION__, micro_ker);
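// Pattern note: each micro-kernel acts on a single source-target pair --
// v accumulates k_dim1 outputs, x = trg - src, invr = 1/|x| (0 at the
// singularity), n the source normal, f the k_dim0 source densities -- and
// KernelFromLambda lifts it to a full KernelFunction named after the
// enclosing function. This first block is the pressure of a Stokeslet:
//   p(x) = (f . x) / (4*pi*|x|^3),
// since 2*invr3*fdotr * (1/(8*pi)) = fdotr / (4*pi*|x|^3).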
// Gradient of the Stokeslet pressure above.
constexpr Integer k_dim0 = DIM;
constexpr Integer k_dim1 = DIM;
static const ValueType scal = 1.0/(8.0*const_pi<ValueType>());
static auto micro_ker = [&](ValueType* v, const ValueType* x, ValueType invr, const ValueType* n, const ValueType* f) {
  ValueType fdotr = 0;
  ValueType invr2 = invr*invr;
  ValueType invr3 = invr2*invr;
  ValueType invr5 = invr2*invr3;
  ValueType invr7 = invr2*invr5;
  for(Integer k=0;k<DIM;k++) fdotr += x[k] * f[k];

  v[0] += (2*invr3*f[0]-6*x[0]*invr5*fdotr) * scal;
  v[1] += (2*invr3*f[1]-6*x[1]*invr5*fdotr) * scal;
  v[2] += (2*invr3*f[2]-6*x[2]*invr5*fdotr) * scal;
};
return KernelFromLambda<k_dim0, k_dim1>(__FUNCTION__, micro_ker);

// Stokeslet velocity (Oseen tensor): u = (f/|x| + x (f.x)/|x|^3) / (8*pi).
constexpr Integer k_dim0 = DIM;
constexpr Integer k_dim1 = DIM;
static const ValueType scal = 1.0/(8.0*const_pi<ValueType>());
static auto micro_ker = [&](ValueType* v, const ValueType* x, ValueType invr, const ValueType* n, const ValueType* f) {
  ValueType fdotr = 0;
  ValueType invr2 = invr*invr;
  ValueType invr3 = invr2*invr;
  ValueType invr5 = invr2*invr3;
  ValueType invr7 = invr2*invr5;
  for(Integer k=0;k<DIM;k++) fdotr += x[k] * f[k];

  v[0] += (x[0]*invr3*fdotr+f[0]*invr) * scal;
  v[1] += (x[1]*invr3*fdotr+f[1]*invr) * scal;
  v[2] += (x[2]*invr3*fdotr+f[2]*invr) * scal;
};
return KernelFromLambda<k_dim0, k_dim1>(__FUNCTION__, micro_ker);
constexpr Integer k_dim0 = DIM;
constexpr Integer k_dim1 = DIM * DIM;
static const ValueType scal = 1.0/(8.0*const_pi<ValueType>());
static auto micro_ker = [&](ValueType* v, const ValueType* x, ValueType invr, const ValueType* n, const ValueType* f) {
  ValueType fdotr = 0;
  ValueType invr2 = invr*invr;
  ValueType invr3 = invr2*invr;
  ValueType invr5 = invr2*invr3;
  ValueType invr7 = invr2*invr5;
  for(Integer k=0;k<DIM;k++) fdotr += x[k] * f[k];

  v[0] += (invr3*fdotr-3*x[0]*x[0]*invr5*fdotr                              ) * scal;
  v[1] += ((-3*x[0]*x[1]*invr5*fdotr)+x[0]*invr3*f[1]-x[1]*invr3*f[0]) * scal;
  v[2] += ((-3*x[0]*x[2]*invr5*fdotr)+x[0]*invr3*f[2]-x[2]*invr3*f[0]) * scal;
  v[3] += ((-3*x[0]*x[1]*invr5*fdotr)-x[0]*invr3*f[1]+x[1]*invr3*f[0]) * scal;
  v[4] += (invr3*fdotr-3*x[1]*x[1]*invr5*fdotr                              ) * scal;
  v[5] += ((-3*x[1]*x[2]*invr5*fdotr)+x[1]*invr3*f[2]-x[2]*invr3*f[1]) * scal;
  v[6] += ((-3*x[0]*x[2]*invr5*fdotr)-x[0]*invr3*f[2]+x[2]*invr3*f[0]) * scal;
  v[7] += ((-3*x[1]*x[2]*invr5*fdotr)-x[1]*invr3*f[2]+x[2]*invr3*f[1]) * scal;
  v[8] += (invr3*fdotr-3*x[2]*x[2]*invr5*fdotr                              ) * scal;
};
return KernelFromLambda<k_dim0, k_dim1>(__FUNCTION__, micro_ker);

constexpr Integer k_dim0 = DIM;
constexpr Integer k_dim1 = DIM * DIM * DIM;
static const ValueType scal = 1.0/(8.0*const_pi<ValueType>());
static auto micro_ker = [&](ValueType* v, const ValueType* x, ValueType invr, const ValueType* n, const ValueType* f) {
  ValueType fdotr = 0;
  ValueType invr2 = invr*invr;
  ValueType invr3 = invr2*invr;
  ValueType invr5 = invr2*invr3;
  ValueType invr7 = invr2*invr5;
  for(Integer k=0;k<DIM;k++) fdotr += x[k] * f[k];

  v[ 0] += ((-9*x[0]*invr5*fdotr)+15*x[0]*x[0]*x[0]*invr7*fdotr+invr3*f[0]-3*x[0]*x[0]*invr5*f[0]                ) * scal;
  v[ 1] += ((-3*x[1]*invr5*fdotr)+15*x[0]*x[0]*x[1]*invr7*fdotr+invr3*f[1]-3*x[0]*x[0]*invr5*f[1]                ) * scal;
  v[ 2] += ((-3*x[2]*invr5*fdotr)+15*x[0]*x[0]*x[2]*invr7*fdotr+invr3*f[2]-3*x[0]*x[0]*invr5*f[2]                ) * scal;
  v[ 3] += ((-3*x[1]*invr5*fdotr)+15*x[0]*x[0]*x[1]*invr7*fdotr+invr3*f[1]-3*x[0]*x[0]*invr5*f[1]                ) * scal;
  v[ 4] += ((-3*x[0]*invr5*fdotr)+15*x[0]*x[1]*x[1]*invr7*fdotr-6*x[0]*x[1]*invr5*f[1]-invr3*f[0]+3*x[1]*x[1]*invr5*f[0]) * scal;
  v[ 5] += (15*x[0]*x[1]*x[2]*invr7*fdotr-3*x[0]*x[1]*invr5*f[2]-3*x[0]*x[2]*invr5*f[1]+3*x[1]*x[2]*invr5*f[0]          ) * scal;
  v[ 6] += ((-3*x[2]*invr5*fdotr)+15*x[0]*x[0]*x[2]*invr7*fdotr+invr3*f[2]-3*x[0]*x[0]*invr5*f[2]                ) * scal;
  v[ 7] += (15*x[0]*x[1]*x[2]*invr7*fdotr-3*x[0]*x[1]*invr5*f[2]-3*x[0]*x[2]*invr5*f[1]+3*x[1]*x[2]*invr5*f[0]          ) * scal;
  v[ 8] += ((-3*x[0]*invr5*fdotr)+15*x[0]*x[2]*x[2]*invr7*fdotr-6*x[0]*x[2]*invr5*f[2]-invr3*f[0]+3*x[2]*x[2]*invr5*f[0]) * scal;
  v[ 9] += ((-3*x[1]*invr5*fdotr)+15*x[0]*x[0]*x[1]*invr7*fdotr-invr3*f[1]+3*x[0]*x[0]*invr5*f[1]-6*x[0]*x[1]*invr5*f[0]) * scal;
  v[10] += ((-3*x[0]*invr5*fdotr)+15*x[0]*x[1]*x[1]*invr7*fdotr+invr3*f[0]-3*x[1]*x[1]*invr5*f[0]                ) * scal;
  v[11] += (15*x[0]*x[1]*x[2]*invr7*fdotr-3*x[0]*x[1]*invr5*f[2]+3*x[0]*x[2]*invr5*f[1]-3*x[1]*x[2]*invr5*f[0]          ) * scal;
  v[12] += ((-3*x[0]*invr5*fdotr)+15*x[0]*x[1]*x[1]*invr7*fdotr+invr3*f[0]-3*x[1]*x[1]*invr5*f[0]                ) * scal;
  v[13] += ((-9*x[1]*invr5*fdotr)+15*x[1]*x[1]*x[1]*invr7*fdotr+invr3*f[1]-3*x[1]*x[1]*invr5*f[1]                ) * scal;
  v[14] += ((-3*x[2]*invr5*fdotr)+15*x[1]*x[1]*x[2]*invr7*fdotr+invr3*f[2]-3*x[1]*x[1]*invr5*f[2]                ) * scal;
  v[15] += (15*x[0]*x[1]*x[2]*invr7*fdotr-3*x[0]*x[1]*invr5*f[2]+3*x[0]*x[2]*invr5*f[1]-3*x[1]*x[2]*invr5*f[0]          ) * scal;
  v[16] += ((-3*x[2]*invr5*fdotr)+15*x[1]*x[1]*x[2]*invr7*fdotr+invr3*f[2]-3*x[1]*x[1]*invr5*f[2]                ) * scal;
  v[17] += ((-3*x[1]*invr5*fdotr)+15*x[1]*x[2]*x[2]*invr7*fdotr-6*x[1]*x[2]*invr5*f[2]-invr3*f[1]+3*x[2]*x[2]*invr5*f[1]) * scal;
  v[18] += ((-3*x[2]*invr5*fdotr)+15*x[0]*x[0]*x[2]*invr7*fdotr-invr3*f[2]+3*x[0]*x[0]*invr5*f[2]-6*x[0]*x[2]*invr5*f[0]) * scal;
  v[19] += (15*x[0]*x[1]*x[2]*invr7*fdotr+3*x[0]*x[1]*invr5*f[2]-3*x[0]*x[2]*invr5*f[1]-3*x[1]*x[2]*invr5*f[0]          ) * scal;
  v[20] += ((-3*x[0]*invr5*fdotr)+15*x[0]*x[2]*x[2]*invr7*fdotr+invr3*f[0]-3*x[2]*x[2]*invr5*f[0]                ) * scal;
  v[21] += (15*x[0]*x[1]*x[2]*invr7*fdotr+3*x[0]*x[1]*invr5*f[2]-3*x[0]*x[2]*invr5*f[1]-3*x[1]*x[2]*invr5*f[0]          ) * scal;
  v[22] += ((-3*x[2]*invr5*fdotr)+15*x[1]*x[1]*x[2]*invr7*fdotr-invr3*f[2]+3*x[1]*x[1]*invr5*f[2]-6*x[1]*x[2]*invr5*f[1]) * scal;
  v[23] += ((-3*x[1]*invr5*fdotr)+15*x[1]*x[2]*x[2]*invr7*fdotr+invr3*f[1]-3*x[2]*x[2]*invr5*f[1]                ) * scal;
  v[24] += ((-3*x[0]*invr5*fdotr)+15*x[0]*x[2]*x[2]*invr7*fdotr+invr3*f[0]-3*x[2]*x[2]*invr5*f[0]                ) * scal;
  v[25] += ((-3*x[1]*invr5*fdotr)+15*x[1]*x[2]*x[2]*invr7*fdotr+invr3*f[1]-3*x[2]*x[2]*invr5*f[1]                ) * scal;
  v[26] += ((-9*x[2]*invr5*fdotr)+15*x[2]*x[2]*x[2]*invr7*fdotr+invr3*f[2]-3*x[2]*x[2]*invr5*f[2]                ) * scal;
};
return KernelFromLambda<k_dim0, k_dim1>(__FUNCTION__, micro_ker);

constexpr Integer k_dim0 = DIM;
constexpr Integer k_dim1 = DIM+1;
static const ValueType scal = 1.0/(8.0*const_pi<ValueType>());
static auto micro_ker = [&](ValueType* v, const ValueType* x, ValueType invr, const ValueType* n, const ValueType* f) {
  ValueType fdotr = 0;
  ValueType invr2 = invr*invr;
  ValueType invr3 = invr2*invr;
  ValueType invr5 = invr2*invr3;
  ValueType invr7 = invr2*invr5;
  for(Integer k=0;k<DIM;k++) fdotr += x[k] * f[k];

  v[0] += (2*invr3*fdotr             ) * scal;
  v[1] += (x[0]*invr3*fdotr+f[0]*invr) * scal;
  v[2] += (x[1]*invr3*fdotr+f[1]*invr) * scal;
  v[3] += (x[2]*invr3*fdotr+f[2]*invr) * scal;
};
return KernelFromLambda<k_dim0, k_dim1>(__FUNCTION__, micro_ker);
constexpr Integer k_dim0 = DIM+1;
constexpr Integer k_dim1 = 1;
static const ValueType scal = 1.0/(8.0*const_pi<ValueType>());
static auto micro_ker = [&](ValueType* v, const ValueType* x, ValueType invr, const ValueType* n, const ValueType* f) {
  ValueType fdotr = 0;
  ValueType invr2 = invr*invr;
  ValueType invr3 = invr2*invr;
  ValueType invr5 = invr2*invr3;
  ValueType invr7 = invr2*invr5;
  for(Integer k=0;k<DIM;k++) fdotr += x[k] * f[k];

  v[0] += (2*invr3*fdotr) * scal;
};
return KernelFromLambda<k_dim0, k_dim1>(__FUNCTION__, micro_ker);

constexpr Integer k_dim0 = DIM+1;
constexpr Integer k_dim1 = DIM;
static const ValueType scal = 1.0/(8.0*const_pi<ValueType>());
static auto micro_ker = [&](ValueType* v, const ValueType* x, ValueType invr, const ValueType* n, const ValueType* f) {
  ValueType fdotr = 0;
  ValueType invr2 = invr*invr;
  ValueType invr3 = invr2*invr;
  ValueType invr5 = invr2*invr3;
  ValueType invr7 = invr2*invr5;
  for(Integer k=0;k<DIM;k++) fdotr += x[k] * f[k];

  v[0] += (2*invr3*f[0]-6*x[0]*invr5*fdotr) * scal;
  v[1] += (2*invr3*f[1]-6*x[1]*invr5*fdotr) * scal;
  v[2] += (2*invr3*f[2]-6*x[2]*invr5*fdotr) * scal;
};
return KernelFromLambda<k_dim0, k_dim1>(__FUNCTION__, micro_ker);

constexpr Integer k_dim0 = DIM+1;
constexpr Integer k_dim1 = DIM;
static const ValueType scal = 1.0/(8.0*const_pi<ValueType>());
static auto micro_ker = [&](ValueType* v, const ValueType* x, ValueType invr, const ValueType* n, const ValueType* f) {
  ValueType fdotr = 0;
  ValueType invr2 = invr*invr;
  ValueType invr3 = invr2*invr;
  ValueType invr5 = invr2*invr3;
  ValueType invr7 = invr2*invr5;
  for(Integer k=0;k<DIM;k++) fdotr += x[k] * f[k];

  v[0] += (x[0]*invr3*f[3]+x[0]*invr3*fdotr+f[0]*invr) * scal;
  v[1] += (x[1]*invr3*f[3]+x[1]*invr3*fdotr+f[1]*invr) * scal;
  v[2] += (x[2]*invr3*f[3]+x[2]*invr3*fdotr+f[2]*invr) * scal;
};
return KernelFromLambda<k_dim0, k_dim1>(__FUNCTION__, micro_ker);

constexpr Integer k_dim0 = DIM+1;
constexpr Integer k_dim1 = DIM * DIM;
static const ValueType scal = 1.0/(8.0*const_pi<ValueType>());
static auto micro_ker = [&](ValueType* v, const ValueType* x, ValueType invr, const ValueType* n, const ValueType* f) {
  ValueType fdotr = 0;
  ValueType invr2 = invr*invr;
  ValueType invr3 = invr2*invr;
  ValueType invr5 = invr2*invr3;
  ValueType invr7 = invr2*invr5;
  for(Integer k=0;k<DIM;k++) fdotr += x[k] * f[k];

  v[0] += (invr3*f[3]-3*x[0]*x[0]*invr5*f[3]+invr3*fdotr-3*x[0]*x[0]*invr5*fdotr               ) * scal;
  v[1] += ((-3*x[0]*x[1]*invr5*f[3])-3*x[0]*x[1]*invr5*fdotr+x[0]*invr3*f[1]-x[1]*invr3*f[0]) * scal;
  v[2] += ((-3*x[0]*x[2]*invr5*f[3])-3*x[0]*x[2]*invr5*fdotr+x[0]*invr3*f[2]-x[2]*invr3*f[0]) * scal;
  v[3] += ((-3*x[0]*x[1]*invr5*f[3])-3*x[0]*x[1]*invr5*fdotr-x[0]*invr3*f[1]+x[1]*invr3*f[0]) * scal;
  v[4] += (invr3*f[3]-3*x[1]*x[1]*invr5*f[3]+invr3*fdotr-3*x[1]*x[1]*invr5*fdotr               ) * scal;
  v[5] += ((-3*x[1]*x[2]*invr5*f[3])-3*x[1]*x[2]*invr5*fdotr+x[1]*invr3*f[2]-x[2]*invr3*f[1]) * scal;
  v[6] += ((-3*x[0]*x[2]*invr5*f[3])-3*x[0]*x[2]*invr5*fdotr-x[0]*invr3*f[2]+x[2]*invr3*f[0]) * scal;
  v[7] += ((-3*x[1]*x[2]*invr5*f[3])-3*x[1]*x[2]*invr5*fdotr-x[1]*invr3*f[2]+x[2]*invr3*f[1]) * scal;
  v[8] += (invr3*f[3]-3*x[2]*x[2]*invr5*f[3]+invr3*fdotr-3*x[2]*x[2]*invr5*fdotr               ) * scal;
};
return KernelFromLambda<k_dim0, k_dim1>(__FUNCTION__, micro_ker);
constexpr Integer k_dim0 = DIM+1;
constexpr Integer k_dim1 = DIM * DIM * DIM;
static const ValueType scal = 1.0/(8.0*const_pi<ValueType>());
static auto micro_ker = [&](ValueType* v, const ValueType* x, ValueType invr, const ValueType* n, const ValueType* f) {
  ValueType fdotr = 0;
  ValueType invr2 = invr*invr;
  ValueType invr3 = invr2*invr;
  ValueType invr5 = invr2*invr3;
  ValueType invr7 = invr2*invr5;
  for(Integer k=0;k<DIM;k++) fdotr += x[k] * f[k];

  v[ 0] += ((-9*x[0]*invr5*f[3])+15*x[0]*x[0]*x[0]*invr7*f[3]-9*x[0]*invr5*fdotr+15*x[0]*x[0]*x[0]*invr7*fdotr+invr3*f[0]-3*x[0]*x[0]*invr5*f[0]                ) * scal;
  v[ 1] += ((-3*x[1]*invr5*f[3])+15*x[0]*x[0]*x[1]*invr7*f[3]-3*x[1]*invr5*fdotr+15*x[0]*x[0]*x[1]*invr7*fdotr+invr3*f[1]-3*x[0]*x[0]*invr5*f[1]                ) * scal;
  v[ 2] += ((-3*x[2]*invr5*f[3])+15*x[0]*x[0]*x[2]*invr7*f[3]-3*x[2]*invr5*fdotr+15*x[0]*x[0]*x[2]*invr7*fdotr+invr3*f[2]-3*x[0]*x[0]*invr5*f[2]                ) * scal;
  v[ 3] += ((-3*x[1]*invr5*f[3])+15*x[0]*x[0]*x[1]*invr7*f[3]-3*x[1]*invr5*fdotr+15*x[0]*x[0]*x[1]*invr7*fdotr+invr3*f[1]-3*x[0]*x[0]*invr5*f[1]                ) * scal;
  v[ 4] += ((-3*x[0]*invr5*f[3])+15*x[0]*x[1]*x[1]*invr7*f[3]-3*x[0]*invr5*fdotr+15*x[0]*x[1]*x[1]*invr7*fdotr-6*x[0]*x[1]*invr5*f[1]-invr3*f[0]+3*x[1]*x[1]*invr5*f[0]) * scal;
  v[ 5] += (15*x[0]*x[1]*x[2]*invr7*f[3]+15*x[0]*x[1]*x[2]*invr7*fdotr-3*x[0]*x[1]*invr5*f[2]-3*x[0]*x[2]*invr5*f[1]+3*x[1]*x[2]*invr5*f[0]                ) * scal;
  v[ 6] += ((-3*x[2]*invr5*f[3])+15*x[0]*x[0]*x[2]*invr7*f[3]-3*x[2]*invr5*fdotr+15*x[0]*x[0]*x[2]*invr7*fdotr+invr3*f[2]-3*x[0]*x[0]*invr5*f[2]                ) * scal;
  v[ 7] += (15*x[0]*x[1]*x[2]*invr7*f[3]+15*x[0]*x[1]*x[2]*invr7*fdotr-3*x[0]*x[1]*invr5*f[2]-3*x[0]*x[2]*invr5*f[1]+3*x[1]*x[2]*invr5*f[0]                ) * scal;
  v[ 8] += ((-3*x[0]*invr5*f[3])+15*x[0]*x[2]*x[2]*invr7*f[3]-3*x[0]*invr5*fdotr+15*x[0]*x[2]*x[2]*invr7*fdotr-6*x[0]*x[2]*invr5*f[2]-invr3*f[0]+3*x[2]*x[2]*invr5*f[0]) * scal;
  v[ 9] += ((-3*x[1]*invr5*f[3])+15*x[0]*x[0]*x[1]*invr7*f[3]-3*x[1]*invr5*fdotr+15*x[0]*x[0]*x[1]*invr7*fdotr-invr3*f[1]+3*x[0]*x[0]*invr5*f[1]-6*x[0]*x[1]*invr5*f[0]) * scal;
  v[10] += ((-3*x[0]*invr5*f[3])+15*x[0]*x[1]*x[1]*invr7*f[3]-3*x[0]*invr5*fdotr+15*x[0]*x[1]*x[1]*invr7*fdotr+invr3*f[0]-3*x[1]*x[1]*invr5*f[0]                ) * scal;
  v[11] += (15*x[0]*x[1]*x[2]*invr7*f[3]+15*x[0]*x[1]*x[2]*invr7*fdotr-3*x[0]*x[1]*invr5*f[2]+3*x[0]*x[2]*invr5*f[1]-3*x[1]*x[2]*invr5*f[0]                ) * scal;
  v[12] += ((-3*x[0]*invr5*f[3])+15*x[0]*x[1]*x[1]*invr7*f[3]-3*x[0]*invr5*fdotr+15*x[0]*x[1]*x[1]*invr7*fdotr+invr3*f[0]-3*x[1]*x[1]*invr5*f[0]                ) * scal;
  v[13] += ((-9*x[1]*invr5*f[3])+15*x[1]*x[1]*x[1]*invr7*f[3]-9*x[1]*invr5*fdotr+15*x[1]*x[1]*x[1]*invr7*fdotr+invr3*f[1]-3*x[1]*x[1]*invr5*f[1]                ) * scal;
  v[14] += ((-3*x[2]*invr5*f[3])+15*x[1]*x[1]*x[2]*invr7*f[3]-3*x[2]*invr5*fdotr+15*x[1]*x[1]*x[2]*invr7*fdotr+invr3*f[2]-3*x[1]*x[1]*invr5*f[2]                ) * scal;
  v[15] += (15*x[0]*x[1]*x[2]*invr7*f[3]+15*x[0]*x[1]*x[2]*invr7*fdotr-3*x[0]*x[1]*invr5*f[2]+3*x[0]*x[2]*invr5*f[1]-3*x[1]*x[2]*invr5*f[0]                ) * scal;
  v[16] += ((-3*x[2]*invr5*f[3])+15*x[1]*x[1]*x[2]*invr7*f[3]-3*x[2]*invr5*fdotr+15*x[1]*x[1]*x[2]*invr7*fdotr+invr3*f[2]-3*x[1]*x[1]*invr5*f[2]                ) * scal;
  v[17] += ((-3*x[1]*invr5*f[3])+15*x[1]*x[2]*x[2]*invr7*f[3]-3*x[1]*invr5*fdotr+15*x[1]*x[2]*x[2]*invr7*fdotr-6*x[1]*x[2]*invr5*f[2]-invr3*f[1]+3*x[2]*x[2]*invr5*f[1]) * scal;
  v[18] += ((-3*x[2]*invr5*f[3])+15*x[0]*x[0]*x[2]*invr7*f[3]-3*x[2]*invr5*fdotr+15*x[0]*x[0]*x[2]*invr7*fdotr-invr3*f[2]+3*x[0]*x[0]*invr5*f[2]-6*x[0]*x[2]*invr5*f[0]) * scal;
  v[19] += (15*x[0]*x[1]*x[2]*invr7*f[3]+15*x[0]*x[1]*x[2]*invr7*fdotr+3*x[0]*x[1]*invr5*f[2]-3*x[0]*x[2]*invr5*f[1]-3*x[1]*x[2]*invr5*f[0]                ) * scal;
  v[20] += ((-3*x[0]*invr5*f[3])+15*x[0]*x[2]*x[2]*invr7*f[3]-3*x[0]*invr5*fdotr+15*x[0]*x[2]*x[2]*invr7*fdotr+invr3*f[0]-3*x[2]*x[2]*invr5*f[0]                ) * scal;
  v[21] += (15*x[0]*x[1]*x[2]*invr7*f[3]+15*x[0]*x[1]*x[2]*invr7*fdotr+3*x[0]*x[1]*invr5*f[2]-3*x[0]*x[2]*invr5*f[1]-3*x[1]*x[2]*invr5*f[0]                ) * scal;
  v[22] += ((-3*x[2]*invr5*f[3])+15*x[1]*x[1]*x[2]*invr7*f[3]-3*x[2]*invr5*fdotr+15*x[1]*x[1]*x[2]*invr7*fdotr-invr3*f[2]+3*x[1]*x[1]*invr5*f[2]-6*x[1]*x[2]*invr5*f[1]) * scal;
  v[23] += ((-3*x[1]*invr5*f[3])+15*x[1]*x[2]*x[2]*invr7*f[3]-3*x[1]*invr5*fdotr+15*x[1]*x[2]*x[2]*invr7*fdotr+invr3*f[1]-3*x[2]*x[2]*invr5*f[1]                ) * scal;
  v[24] += ((-3*x[0]*invr5*f[3])+15*x[0]*x[2]*x[2]*invr7*f[3]-3*x[0]*invr5*fdotr+15*x[0]*x[2]*x[2]*invr7*fdotr+invr3*f[0]-3*x[2]*x[2]*invr5*f[0]                ) * scal;
  v[25] += ((-3*x[1]*invr5*f[3])+15*x[1]*x[2]*x[2]*invr7*f[3]-3*x[1]*invr5*fdotr+15*x[1]*x[2]*x[2]*invr7*fdotr+invr3*f[1]-3*x[2]*x[2]*invr5*f[1]                ) * scal;
  v[26] += ((-9*x[2]*invr5*f[3])+15*x[2]*x[2]*x[2]*invr7*f[3]-9*x[2]*invr5*fdotr+15*x[2]*x[2]*x[2]*invr7*fdotr+invr3*f[2]-3*x[2]*x[2]*invr5*f[2]                ) * scal;
};
return KernelFromLambda<k_dim0, k_dim1>(__FUNCTION__, micro_ker);

constexpr Integer k_dim0 = DIM+1;
constexpr Integer k_dim1 = DIM+1;
static const ValueType scal = 1.0/(8.0*const_pi<ValueType>());
static auto micro_ker = [&](ValueType* v, const ValueType* x, ValueType invr, const ValueType* n, const ValueType* f) {
  ValueType fdotr = 0;
  ValueType invr2 = invr*invr;
  ValueType invr3 = invr2*invr;
  ValueType invr5 = invr2*invr3;
  ValueType invr7 = invr2*invr5;
  for(Integer k=0;k<DIM;k++) fdotr += x[k] * f[k];

  v[0] += (2*invr3*fdotr                             ) * scal;
  v[1] += (x[0]*invr3*f[3]+x[0]*invr3*fdotr+f[0]*invr) * scal;
  v[2] += (x[1]*invr3*f[3]+x[1]*invr3*fdotr+f[1]*invr) * scal;
  v[3] += (x[2]*invr3*f[3]+x[2]*invr3*fdotr+f[2]*invr) * scal;
};
return KernelFromLambda<k_dim0, k_dim1>(__FUNCTION__, micro_ker);
// Double-layer variants below: n is the source normal (dipole direction),
// entering through ndotr and ndotf.
constexpr Integer k_dim0 = DIM;
constexpr Integer k_dim1 = 1;
static const ValueType scal = 1.0/(8.0*const_pi<ValueType>());
static auto micro_ker = [&](ValueType* v, const ValueType* x, ValueType invr, const ValueType* n, const ValueType* f) {
  ValueType fdotr = 0;
  ValueType ndotr = 0;
  ValueType ndotf = 0;
  ValueType invr2 = invr*invr;
  ValueType invr3 = invr2*invr;
  ValueType invr5 = invr2*invr3;
  ValueType invr7 = invr2*invr5;
  for(Integer k=0;k<DIM;k++) fdotr += f[k] * x[k];
  for(Integer k=0;k<DIM;k++) ndotr += n[k] * x[k];
  for(Integer k=0;k<DIM;k++) ndotf += n[k] * f[k];

  v[0] += 4*(invr3*(-ndotf)+3*invr5*fdotr*ndotr) * scal;
};
return KernelFromLambda<k_dim0, k_dim1>(__FUNCTION__, micro_ker);

constexpr Integer k_dim0 = DIM;
constexpr Integer k_dim1 = DIM;
static const ValueType scal = 1.0/(8.0*const_pi<ValueType>());
static auto micro_ker = [&](ValueType* v, const ValueType* x, ValueType invr, const ValueType* n, const ValueType* f) {
  ValueType fdotr = 0;
  ValueType ndotr = 0;
  ValueType ndotf = 0;
  ValueType invr2 = invr*invr;
  ValueType invr3 = invr2*invr;
  ValueType invr5 = invr2*invr3;
  ValueType invr7 = invr2*invr5;
  for(Integer k=0;k<DIM;k++) fdotr += f[k] * x[k];
  for(Integer k=0;k<DIM;k++) ndotr += n[k] * x[k];
  for(Integer k=0;k<DIM;k++) ndotf += n[k] * f[k];

  v[0] += 4*((-3*x[0]*invr5*(-ndotf))-15*x[0]*invr7*fdotr*ndotr+3*invr5*f[0]*ndotr+3*invr5*fdotr*n[0]) * scal;
  v[1] += 4*((-3*x[1]*invr5*(-ndotf))-15*x[1]*invr7*fdotr*ndotr+3*invr5*f[1]*ndotr+3*invr5*fdotr*n[1]) * scal;
  v[2] += 4*((-3*x[2]*invr5*(-ndotf))-15*x[2]*invr7*fdotr*ndotr+3*invr5*f[2]*ndotr+3*invr5*fdotr*n[2]) * scal;
};
return KernelFromLambda<k_dim0, k_dim1>(__FUNCTION__, micro_ker);

constexpr Integer k_dim0 = DIM;
constexpr Integer k_dim1 = DIM;
static const ValueType scal = 1.0/(8.0*const_pi<ValueType>());
static auto micro_ker = [&](ValueType* v, const ValueType* x, ValueType invr, const ValueType* n, const ValueType* f) {
  ValueType fdotr = 0;
  ValueType ndotr = 0;
  ValueType ndotf = 0;
  ValueType invr2 = invr*invr;
  ValueType invr3 = invr2*invr;
  ValueType invr5 = invr2*invr3;
  ValueType invr7 = invr2*invr5;
  for(Integer k=0;k<DIM;k++) fdotr += f[k] * x[k];
  for(Integer k=0;k<DIM;k++) ndotr += n[k] * x[k];
  for(Integer k=0;k<DIM;k++) ndotf += n[k] * f[k];

  v[0] += (6*x[0]*invr5*fdotr*ndotr) * scal;
  v[1] += (6*x[1]*invr5*fdotr*ndotr) * scal;
  v[2] += (6*x[2]*invr5*fdotr*ndotr) * scal;
};
return KernelFromLambda<k_dim0, k_dim1>(__FUNCTION__, micro_ker);
constexpr Integer k_dim0 = DIM;
constexpr Integer k_dim1 = DIM * DIM;
static const ValueType scal = 1.0/(8.0*const_pi<ValueType>());
static auto micro_ker = [&](ValueType* v, const ValueType* x, ValueType invr, const ValueType* n, const ValueType* f) {
  ValueType fdotr = 0;
  ValueType ndotr = 0;
  ValueType ndotf = 0;
  ValueType invr2 = invr*invr;
  ValueType invr3 = invr2*invr;
  ValueType invr5 = invr2*invr3;
  ValueType invr7 = invr2*invr5;
  for(Integer k=0;k<DIM;k++) fdotr += f[k] * x[k];
  for(Integer k=0;k<DIM;k++) ndotr += n[k] * x[k];
  for(Integer k=0;k<DIM;k++) ndotf += n[k] * f[k];

  v[0] += (6*invr5*fdotr*ndotr-30*x[0]*x[0]*invr7*fdotr*ndotr+6*x[0]*invr5*f[0]*ndotr+6*x[0]*invr5*fdotr*n[0]) * scal;
  v[1] += ((-30*x[0]*x[1]*invr7*fdotr*ndotr)+6*x[0]*invr5*f[1]*ndotr+6*x[0]*invr5*fdotr*n[1]          ) * scal;
  v[2] += ((-30*x[0]*x[2]*invr7*fdotr*ndotr)+6*x[0]*invr5*f[2]*ndotr+6*x[0]*invr5*fdotr*n[2]          ) * scal;
  v[3] += ((-30*x[0]*x[1]*invr7*fdotr*ndotr)+6*x[1]*invr5*f[0]*ndotr+6*x[1]*invr5*fdotr*n[0]          ) * scal;
  v[4] += (6*invr5*fdotr*ndotr-30*x[1]*x[1]*invr7*fdotr*ndotr+6*x[1]*invr5*f[1]*ndotr+6*x[1]*invr5*fdotr*n[1]) * scal;
  v[5] += ((-30*x[1]*x[2]*invr7*fdotr*ndotr)+6*x[1]*invr5*f[2]*ndotr+6*x[1]*invr5*fdotr*n[2]          ) * scal;
  v[6] += ((-30*x[0]*x[2]*invr7*fdotr*ndotr)+6*x[2]*invr5*f[0]*ndotr+6*x[2]*invr5*fdotr*n[0]          ) * scal;
  v[7] += ((-30*x[1]*x[2]*invr7*fdotr*ndotr)+6*x[2]*invr5*f[1]*ndotr+6*x[2]*invr5*fdotr*n[1]          ) * scal;
  v[8] += (6*invr5*fdotr*ndotr-30*x[2]*x[2]*invr7*fdotr*ndotr+6*x[2]*invr5*f[2]*ndotr+6*x[2]*invr5*fdotr*n[2]) * scal;
};
return KernelFromLambda<k_dim0, k_dim1>(__FUNCTION__, micro_ker);

constexpr Integer k_dim0 = DIM;
constexpr Integer k_dim1 = DIM * DIM * DIM;
static const ValueType scal = 1.0/(8.0*const_pi<ValueType>());
static auto micro_ker = [&](ValueType* v, const ValueType* x, ValueType invr, const ValueType* n, const ValueType* f) {
  ValueType fdotr = 0;
  ValueType ndotr = 0;
  ValueType ndotf = 0;
  ValueType invr2 = invr*invr;
  ValueType invr3 = invr2*invr;
  ValueType invr5 = invr2*invr3;
  ValueType invr7 = invr2*invr5;
  ValueType invr9 = invr2*invr7;
  for(Integer k=0;k<DIM;k++) fdotr += f[k] * x[k];
  for(Integer k=0;k<DIM;k++) ndotr += n[k] * x[k];
  for(Integer k=0;k<DIM;k++) ndotf += n[k] * f[k];

  v[ 0] += ((-90*x[0]*invr7*fdotr*ndotr)+210*x[0]*x[0]*x[0]*invr9*fdotr*ndotr+12*invr5*f[0]*ndotr-60*x[0]*x[0]*invr7*f[0]*ndotr+12*invr5*fdotr*n[0]-60*x[0]*x[0]*invr7*fdotr*n[0]+12*x[0]*invr5*f[0]*n[0]                ) * scal;
  v[ 1] += ((-30*x[1]*invr7*fdotr*ndotr)+210*x[0]*x[0]*x[1]*invr9*fdotr*ndotr+6*invr5*f[1]*ndotr-30*x[0]*x[0]*invr7*f[1]*ndotr-30*x[0]*x[1]*invr7*f[0]*ndotr+6*invr5*fdotr*n[1]-30*x[0]*x[0]*invr7*fdotr*n[1]+6*x[0]*invr5*f[0]*n[1]-30*x[0]*x[1]*invr7*fdotr*n[0]+6*x[0]*invr5*f[1]*n[0]) * scal;
  v[ 2] += ((-30*x[2]*invr7*fdotr*ndotr)+210*x[0]*x[0]*x[2]*invr9*fdotr*ndotr+6*invr5*f[2]*ndotr-30*x[0]*x[0]*invr7*f[2]*ndotr-30*x[0]*x[2]*invr7*f[0]*ndotr+6*invr5*fdotr*n[2]-30*x[0]*x[0]*invr7*fdotr*n[2]+6*x[0]*invr5*f[0]*n[2]-30*x[0]*x[2]*invr7*fdotr*n[0]+6*x[0]*invr5*f[2]*n[0]) * scal;
  v[ 3] += ((-30*x[1]*invr7*fdotr*ndotr)+210*x[0]*x[0]*x[1]*invr9*fdotr*ndotr+6*invr5*f[1]*ndotr-30*x[0]*x[0]*invr7*f[1]*ndotr-30*x[0]*x[1]*invr7*f[0]*ndotr+6*invr5*fdotr*n[1]-30*x[0]*x[0]*invr7*fdotr*n[1]+6*x[0]*invr5*f[0]*n[1]-30*x[0]*x[1]*invr7*fdotr*n[0]+6*x[0]*invr5*f[1]*n[0]) * scal;
  v[ 4] += ((-30*x[0]*invr7*fdotr*ndotr)+210*x[0]*x[1]*x[1]*invr9*fdotr*ndotr-60*x[0]*x[1]*invr7*f[1]*ndotr-60*x[0]*x[1]*invr7*fdotr*n[1]+12*x[0]*invr5*f[1]*n[1]                ) * scal;
  v[ 5] += (210*x[0]*x[1]*x[2]*invr9*fdotr*ndotr-30*x[0]*x[1]*invr7*f[2]*ndotr-30*x[0]*x[2]*invr7*f[1]*ndotr-30*x[0]*x[1]*invr7*fdotr*n[2]+6*x[0]*invr5*f[1]*n[2]-30*x[0]*x[2]*invr7*fdotr*n[1]+6*x[0]*invr5*f[2]*n[1]                ) * scal;
  v[ 6] += ((-30*x[2]*invr7*fdotr*ndotr)+210*x[0]*x[0]*x[2]*invr9*fdotr*ndotr+6*invr5*f[2]*ndotr-30*x[0]*x[0]*invr7*f[2]*ndotr-30*x[0]*x[2]*invr7*f[0]*ndotr+6*invr5*fdotr*n[2]-30*x[0]*x[0]*invr7*fdotr*n[2]+6*x[0]*invr5*f[0]*n[2]-30*x[0]*x[2]*invr7*fdotr*n[0]+6*x[0]*invr5*f[2]*n[0]) * scal;
  v[ 7] += (210*x[0]*x[1]*x[2]*invr9*fdotr*ndotr-30*x[0]*x[1]*invr7*f[2]*ndotr-30*x[0]*x[2]*invr7*f[1]*ndotr-30*x[0]*x[1]*invr7*fdotr*n[2]+6*x[0]*invr5*f[1]*n[2]-30*x[0]*x[2]*invr7*fdotr*n[1]+6*x[0]*invr5*f[2]*n[1]                ) * scal;
  v[ 8] += ((-30*x[0]*invr7*fdotr*ndotr)+210*x[0]*x[2]*x[2]*invr9*fdotr*ndotr-60*x[0]*x[2]*invr7*f[2]*ndotr-60*x[0]*x[2]*invr7*fdotr*n[2]+12*x[0]*invr5*f[2]*n[2]                ) * scal;
  v[ 9] += ((-30*x[1]*invr7*fdotr*ndotr)+210*x[0]*x[0]*x[1]*invr9*fdotr*ndotr-60*x[0]*x[1]*invr7*f[0]*ndotr-60*x[0]*x[1]*invr7*fdotr*n[0]+12*x[1]*invr5*f[0]*n[0]                ) * scal;
  v[10] += ((-30*x[0]*invr7*fdotr*ndotr)+210*x[0]*x[1]*x[1]*invr9*fdotr*ndotr-30*x[0]*x[1]*invr7*f[1]*ndotr+6*invr5*f[0]*ndotr-30*x[1]*x[1]*invr7*f[0]*ndotr-30*x[0]*x[1]*invr7*fdotr*n[1]+6*x[1]*invr5*f[0]*n[1]+6*invr5*fdotr*n[0]-30*x[1]*x[1]*invr7*fdotr*n[0]+6*x[1]*invr5*f[1]*n[0]) * scal;
  v[11] += (210*x[0]*x[1]*x[2]*invr9*fdotr*ndotr-30*x[0]*x[1]*invr7*f[2]*ndotr-30*x[1]*x[2]*invr7*f[0]*ndotr-30*x[0]*x[1]*invr7*fdotr*n[2]+6*x[1]*invr5*f[0]*n[2]-30*x[1]*x[2]*invr7*fdotr*n[0]+6*x[1]*invr5*f[2]*n[0]                ) * scal;
  v[12] += ((-30*x[0]*invr7*fdotr*ndotr)+210*x[0]*x[1]*x[1]*invr9*fdotr*ndotr-30*x[0]*x[1]*invr7*f[1]*ndotr+6*invr5*f[0]*ndotr-30*x[1]*x[1]*invr7*f[0]*ndotr-30*x[0]*x[1]*invr7*fdotr*n[1]+6*x[1]*invr5*f[0]*n[1]+6*invr5*fdotr*n[0]-30*x[1]*x[1]*invr7*fdotr*n[0]+6*x[1]*invr5*f[1]*n[0]) * scal;
  v[13] += ((-90*x[1]*invr7*fdotr*ndotr)+210*x[1]*x[1]*x[1]*invr9*fdotr*ndotr+12*invr5*f[1]*ndotr-60*x[1]*x[1]*invr7*f[1]*ndotr+12*invr5*fdotr*n[1]-60*x[1]*x[1]*invr7*fdotr*n[1]+12*x[1]*invr5*f[1]*n[1]                ) * scal;
  v[14] += ((-30*x[2]*invr7*fdotr*ndotr)+210*x[1]*x[1]*x[2]*invr9*fdotr*ndotr+6*invr5*f[2]*ndotr-30*x[1]*x[1]*invr7*f[2]*ndotr-30*x[1]*x[2]*invr7*f[1]*ndotr+6*invr5*fdotr*n[2]-30*x[1]*x[1]*invr7*fdotr*n[2]+6*x[1]*invr5*f[1]*n[2]-30*x[1]*x[2]*invr7*fdotr*n[1]+6*x[1]*invr5*f[2]*n[1]) * scal;
  v[15] += (210*x[0]*x[1]*x[2]*invr9*fdotr*ndotr-30*x[0]*x[1]*invr7*f[2]*ndotr-30*x[1]*x[2]*invr7*f[0]*ndotr-30*x[0]*x[1]*invr7*fdotr*n[2]+6*x[1]*invr5*f[0]*n[2]-30*x[1]*x[2]*invr7*fdotr*n[0]+6*x[1]*invr5*f[2]*n[0]                ) * scal;
  v[16] += ((-30*x[2]*invr7*fdotr*ndotr)+210*x[1]*x[1]*x[2]*invr9*fdotr*ndotr+6*invr5*f[2]*ndotr-30*x[1]*x[1]*invr7*f[2]*ndotr-30*x[1]*x[2]*invr7*f[1]*ndotr+6*invr5*fdotr*n[2]-30*x[1]*x[1]*invr7*fdotr*n[2]+6*x[1]*invr5*f[1]*n[2]-30*x[1]*x[2]*invr7*fdotr*n[1]+6*x[1]*invr5*f[2]*n[1]) * scal;
  v[17] += ((-30*x[1]*invr7*fdotr*ndotr)+210*x[1]*x[2]*x[2]*invr9*fdotr*ndotr-60*x[1]*x[2]*invr7*f[2]*ndotr-60*x[1]*x[2]*invr7*fdotr*n[2]+12*x[1]*invr5*f[2]*n[2]                ) * scal;
  v[18] += ((-30*x[2]*invr7*fdotr*ndotr)+210*x[0]*x[0]*x[2]*invr9*fdotr*ndotr-60*x[0]*x[2]*invr7*f[0]*ndotr-60*x[0]*x[2]*invr7*fdotr*n[0]+12*x[2]*invr5*f[0]*n[0]                ) * scal;
  v[19] += (210*x[0]*x[1]*x[2]*invr9*fdotr*ndotr-30*x[0]*x[2]*invr7*f[1]*ndotr-30*x[1]*x[2]*invr7*f[0]*ndotr-30*x[0]*x[2]*invr7*fdotr*n[1]+6*x[2]*invr5*f[0]*n[1]-30*x[1]*x[2]*invr7*fdotr*n[0]+6*x[2]*invr5*f[1]*n[0]                ) * scal;
  v[20] += ((-30*x[0]*invr7*fdotr*ndotr)+210*x[0]*x[2]*x[2]*invr9*fdotr*ndotr-30*x[0]*x[2]*invr7*f[2]*ndotr+6*invr5*f[0]*ndotr-30*x[2]*x[2]*invr7*f[0]*ndotr-30*x[0]*x[2]*invr7*fdotr*n[2]+6*x[2]*invr5*f[0]*n[2]+6*invr5*fdotr*n[0]-30*x[2]*x[2]*invr7*fdotr*n[0]+6*x[2]*invr5*f[2]*n[0]) * scal;
  v[21] += (210*x[0]*x[1]*x[2]*invr9*fdotr*ndotr-30*x[0]*x[2]*invr7*f[1]*ndotr-30*x[1]*x[2]*invr7*f[0]*ndotr-30*x[0]*x[2]*invr7*fdotr*n[1]+6*x[2]*invr5*f[0]*n[1]-30*x[1]*x[2]*invr7*fdotr*n[0]+6*x[2]*invr5*f[1]*n[0]                ) * scal;
  v[22] += ((-30*x[2]*invr7*fdotr*ndotr)+210*x[1]*x[1]*x[2]*invr9*fdotr*ndotr-60*x[1]*x[2]*invr7*f[1]*ndotr-60*x[1]*x[2]*invr7*fdotr*n[1]+12*x[2]*invr5*f[1]*n[1]                ) * scal;
  v[23] += ((-30*x[1]*invr7*fdotr*ndotr)+210*x[1]*x[2]*x[2]*invr9*fdotr*ndotr-30*x[1]*x[2]*invr7*f[2]*ndotr+6*invr5*f[1]*ndotr-30*x[2]*x[2]*invr7*f[1]*ndotr-30*x[1]*x[2]*invr7*fdotr*n[2]+6*x[2]*invr5*f[1]*n[2]+6*invr5*fdotr*n[1]-30*x[2]*x[2]*invr7*fdotr*n[1]+6*x[2]*invr5*f[2]*n[1]) * scal;
  v[24] += ((-30*x[0]*invr7*fdotr*ndotr)+210*x[0]*x[2]*x[2]*invr9*fdotr*ndotr-30*x[0]*x[2]*invr7*f[2]*ndotr+6*invr5*f[0]*ndotr-30*x[2]*x[2]*invr7*f[0]*ndotr-30*x[0]*x[2]*invr7*fdotr*n[2]+6*x[2]*invr5*f[0]*n[2]+6*invr5*fdotr*n[0]-30*x[2]*x[2]*invr7*fdotr*n[0]+6*x[2]*invr5*f[2]*n[0]) * scal;
  v[25] += ((-30*x[1]*invr7*fdotr*ndotr)+210*x[1]*x[2]*x[2]*invr9*fdotr*ndotr-30*x[1]*x[2]*invr7*f[2]*ndotr+6*invr5*f[1]*ndotr-30*x[2]*x[2]*invr7*f[1]*ndotr-30*x[1]*x[2]*invr7*fdotr*n[2]+6*x[2]*invr5*f[1]*n[2]+6*invr5*fdotr*n[1]-30*x[2]*x[2]*invr7*fdotr*n[1]+6*x[2]*invr5*f[2]*n[1]) * scal;
  v[26] += ((-90*x[2]*invr7*fdotr*ndotr)+210*x[2]*x[2]*x[2]*invr9*fdotr*ndotr+12*invr5*f[2]*ndotr-60*x[2]*x[2]*invr7*f[2]*ndotr+12*invr5*fdotr*n[2]-60*x[2]*x[2]*invr7*fdotr*n[2]+12*x[2]*invr5*f[2]*n[2]                ) * scal;
};
return KernelFromLambda<k_dim0, k_dim1>(__FUNCTION__, micro_ker);

constexpr Integer k_dim0 = DIM;
constexpr Integer k_dim1 = DIM+1;
static const ValueType scal = 1.0/(8.0*const_pi<ValueType>());
static auto micro_ker = [&](ValueType* v, const ValueType* x, ValueType invr, const ValueType* n, const ValueType* f) {
  ValueType fdotr = 0;
  ValueType ndotr = 0;
  ValueType ndotf = 0;
  ValueType invr2 = invr*invr;
  ValueType invr3 = invr2*invr;
  ValueType invr5 = invr2*invr3;
  ValueType invr7 = invr2*invr5;
  for(Integer k=0;k<DIM;k++) fdotr += f[k] * x[k];
  for(Integer k=0;k<DIM;k++) ndotr += n[k] * x[k];
  for(Integer k=0;k<DIM;k++) ndotf += n[k] * f[k];

  v[0] += 4*(invr3*(-ndotf)+3*invr5*fdotr*ndotr) * scal;
  v[1] += (6*x[0]*invr5*fdotr*ndotr) * scal;
  v[2] += (6*x[1]*invr5*fdotr*ndotr) * scal;
  v[3] += (6*x[2]*invr5*fdotr*ndotr) * scal;
};
return KernelFromLambda<k_dim0, k_dim1>(__FUNCTION__, micro_ker);
// Trampoline used when the kernel is stored as a std::function in the
// context pointer (enclosing signature elided in this excerpt).
const auto& ker = *(std::function<typename KernelFunction<ValueType, DIM>::KerFn>*)(ctx);
ker(r_src, n_src, v_src, r_trg, v_trg, NULL);

// Wrap a per-pair micro-kernel lambda into a KernelFunction. The direct
// O(src_cnt * trg_cnt) evaluation loop below is the generic kernel body;
// the glue constructing the static KernelFunction object is elided.
template <Integer k_dim0, Integer k_dim1, class LambdaType>
inline static const KernelFunction<ValueType, DIM>& KernelFromLambda(std::string name, LambdaType&& micro_ker) {
  Long src_cnt = r_src.Dim() / DIM;
  Long trg_cnt = r_trg.Dim() / DIM;
  for (Long t=0;t<trg_cnt;t++){
    ValueType v[k_dim1];
    for (Integer i=0;i<k_dim1;i++) {
      v[i] = 0;
    }
    for (Long s=0;s<src_cnt;s++){
      ValueType dx[DIM], f[k_dim0], r2 = 0, invr = 0;
      for(Integer k=0;k<DIM;k++) {
        dx[k] = r_trg[t*DIM+k] - r_src[s*DIM+k];
        r2 += dx[k] * dx[k];
      }
      for(Integer k=0;k<k_dim0;k++) {
        f[k] = v_src[s*k_dim0+k];
      }
      if(r2>0) invr=1.0/sqrt(r2);
      micro_ker(v, dx, invr, &n_src[s*DIM], f);
    }
    for (Integer i=0;i<k_dim1;i++) {
      v_trg[t*k_dim1+i] += v[i];
    }
  }
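// Usage sketch (names illustrative): given the Stokeslet-pressure
// micro-kernel above, KernelFromLambda<3, 1>(__FUNCTION__, micro_ker)
// yields a KernelFunction whose operator() runs this direct double loop;
// invr arrives as 0 when source and target coincide, so micro-kernels need
// no extra singularity guard.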
// Vectorized Stokes single-layer (Oseen tensor) velocity uKernel
// (enclosing function signature elided in this excerpt).
static ValueType eps = machine_eps() * 128;
size_t VecLen = sizeof(Vec) / sizeof(ValueType);

size_t NWTN_ITER = 0;
if (RSQRT_INTRIN == (Vec (*)(Vec))rsqrt_intrin0<Vec, ValueType>) NWTN_ITER = 0;
if (RSQRT_INTRIN == (Vec (*)(Vec))rsqrt_intrin1<Vec, ValueType>) NWTN_ITER = 1;
if (RSQRT_INTRIN == (Vec (*)(Vec))rsqrt_intrin2<Vec, ValueType>) NWTN_ITER = 2;
if (RSQRT_INTRIN == (Vec (*)(Vec))rsqrt_intrin3<Vec, ValueType>) NWTN_ITER = 3;

ValueType nwtn_scal = 1;
for (int i = 0; i < NWTN_ITER; i++) {
  nwtn_scal = 2 * nwtn_scal * nwtn_scal * nwtn_scal;
}
const ValueType OOEP = 1.0 / (8 * nwtn_scal * const_pi<ValueType>());
Vec inv_nwtn_scal2 = set_intrin<Vec, ValueType>(1.0 / (nwtn_scal * nwtn_scal));
size_t src_cnt_ = src_coord.Dim(1);
size_t trg_cnt_ = trg_coord.Dim(1);
for (size_t sblk = 0; sblk < src_cnt_; sblk += SRC_BLK) {
  size_t src_cnt = src_cnt_ - sblk;
  if (src_cnt > SRC_BLK) src_cnt = SRC_BLK;
  for (size_t t = 0; t < trg_cnt_; t += VecLen) {
    Vec tx = load_intrin<Vec>(&trg_coord[0][t]);
    Vec ty = load_intrin<Vec>(&trg_coord[1][t]);
    Vec tz = load_intrin<Vec>(&trg_coord[2][t]);

    Vec tvx = zero_intrin<Vec>();
    Vec tvy = zero_intrin<Vec>();
    Vec tvz = zero_intrin<Vec>();
    for (size_t s = sblk; s < sblk + src_cnt; s++) {
      Vec dx = sub_intrin(tx, bcast_intrin<Vec>(&src_coord[0][s]));
      Vec dy = sub_intrin(ty, bcast_intrin<Vec>(&src_coord[1][s]));
      Vec dz = sub_intrin(tz, bcast_intrin<Vec>(&src_coord[2][s]));

      Vec svx = bcast_intrin<Vec>(&src_value[0][s]);
      Vec svy = bcast_intrin<Vec>(&src_value[1][s]);
      Vec svz = bcast_intrin<Vec>(&src_value[2][s]);

      Vec r2 = mul_intrin(dx, dx);
      r2 = add_intrin(r2, mul_intrin(dy, dy));
      r2 = add_intrin(r2, mul_intrin(dz, dz));
      // Suppress the singular self-interaction: lanes with r2 <= eps are
      // zeroed before the reciprocal square root.
      r2 = and_intrin(cmplt_intrin(set_intrin<Vec, ValueType>(eps), r2), r2);

      Vec rinv = RSQRT_INTRIN(r2);
      Vec rinv2 = mul_intrin(mul_intrin(rinv, rinv), inv_nwtn_scal2);

      Vec inner_prod = mul_intrin(svx, dx);
      inner_prod = add_intrin(inner_prod, mul_intrin(svy, dy));
      inner_prod = add_intrin(inner_prod, mul_intrin(svz, dz));
      inner_prod = mul_intrin(inner_prod, rinv2);

      tvx = add_intrin(tvx, mul_intrin(rinv, add_intrin(svx, mul_intrin(dx, inner_prod))));
      tvy = add_intrin(tvy, mul_intrin(rinv, add_intrin(svy, mul_intrin(dy, inner_prod))));
      tvz = add_intrin(tvz, mul_intrin(rinv, add_intrin(svz, mul_intrin(dz, inner_prod))));
    }
    Vec ooep = set_intrin<Vec, ValueType>(OOEP);

    tvx = add_intrin(mul_intrin(tvx, ooep), load_intrin<Vec>(&trg_value[0][t]));
    tvy = add_intrin(mul_intrin(tvy, ooep), load_intrin<Vec>(&trg_value[1][t]));
    tvz = add_intrin(mul_intrin(tvz, ooep), load_intrin<Vec>(&trg_value[2][t]));

    store_intrin(&trg_value[0][t], tvx);
    store_intrin(&trg_value[1][t], tvy);
    store_intrin(&trg_value[2][t], tvz);
  }
}
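// The accumulation above is the Oseen tensor applied to the source force:
//   u(x) += (1/(8*pi)) * (f/|r| + r (f.r)/|r|^3),  r = x - y,
// with inv_nwtn_scal2 correcting rinv2 for the scaled Newton rsqrt.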
// Variant with a 4-component source: src_value[3] seeds inner_prod, adding
// a pressure-like term (enclosing function signature elided).
size_t VecLen = sizeof(Vec) / sizeof(ValueType);

size_t NWTN_ITER = 0;
if (RSQRT_INTRIN == (Vec (*)(Vec))rsqrt_intrin0<Vec, ValueType>) NWTN_ITER = 0;
if (RSQRT_INTRIN == (Vec (*)(Vec))rsqrt_intrin1<Vec, ValueType>) NWTN_ITER = 1;
if (RSQRT_INTRIN == (Vec (*)(Vec))rsqrt_intrin2<Vec, ValueType>) NWTN_ITER = 2;
if (RSQRT_INTRIN == (Vec (*)(Vec))rsqrt_intrin3<Vec, ValueType>) NWTN_ITER = 3;

ValueType nwtn_scal = 1;
for (int i = 0; i < NWTN_ITER; i++) {
  nwtn_scal = 2 * nwtn_scal * nwtn_scal * nwtn_scal;
}
const ValueType OOEP = 1.0 / (8 * nwtn_scal * const_pi<ValueType>());
Vec inv_nwtn_scal2 = set_intrin<Vec, ValueType>(1.0 / (nwtn_scal * nwtn_scal));

size_t src_cnt_ = src_coord.Dim(1);
size_t trg_cnt_ = trg_coord.Dim(1);
for (size_t sblk = 0; sblk < src_cnt_; sblk += SRC_BLK) {
  size_t src_cnt = src_cnt_ - sblk;
  if (src_cnt > SRC_BLK) src_cnt = SRC_BLK;
  for (size_t t = 0; t < trg_cnt_; t += VecLen) {
    Vec tx = load_intrin<Vec>(&trg_coord[0][t]);
    Vec ty = load_intrin<Vec>(&trg_coord[1][t]);
    Vec tz = load_intrin<Vec>(&trg_coord[2][t]);

    Vec tvx = zero_intrin<Vec>();
    Vec tvy = zero_intrin<Vec>();
    Vec tvz = zero_intrin<Vec>();
    for (size_t s = sblk; s < sblk + src_cnt; s++) {
      Vec dx = sub_intrin(tx, bcast_intrin<Vec>(&src_coord[0][s]));
      Vec dy = sub_intrin(ty, bcast_intrin<Vec>(&src_coord[1][s]));
      Vec dz = sub_intrin(tz, bcast_intrin<Vec>(&src_coord[2][s]));

      Vec svx = bcast_intrin<Vec>(&src_value[0][s]);
      Vec svy = bcast_intrin<Vec>(&src_value[1][s]);
      Vec svz = bcast_intrin<Vec>(&src_value[2][s]);
      Vec inner_prod = bcast_intrin<Vec>(&src_value[3][s]);

      Vec r2 = mul_intrin(dx, dx);
      r2 = add_intrin(r2, mul_intrin(dy, dy));
      r2 = add_intrin(r2, mul_intrin(dz, dz));

      Vec rinv = RSQRT_INTRIN(r2);
      Vec rinv2 = mul_intrin(mul_intrin(rinv, rinv), inv_nwtn_scal2);

      inner_prod = add_intrin(inner_prod, mul_intrin(svx, dx));
      inner_prod = add_intrin(inner_prod, mul_intrin(svy, dy));
      inner_prod = add_intrin(inner_prod, mul_intrin(svz, dz));
      inner_prod = mul_intrin(inner_prod, rinv2);

      tvx = add_intrin(tvx, mul_intrin(rinv, add_intrin(svx, mul_intrin(dx, inner_prod))));
      tvy = add_intrin(tvy, mul_intrin(rinv, add_intrin(svy, mul_intrin(dy, inner_prod))));
      tvz = add_intrin(tvz, mul_intrin(rinv, add_intrin(svz, mul_intrin(dz, inner_prod))));
    }
    Vec ooep = set_intrin<Vec, ValueType>(OOEP);

    tvx = add_intrin(mul_intrin(tvx, ooep), load_intrin<Vec>(&trg_value[0][t]));
    tvy = add_intrin(mul_intrin(tvy, ooep), load_intrin<Vec>(&trg_value[1][t]));
    tvz = add_intrin(mul_intrin(tvz, ooep), load_intrin<Vec>(&trg_value[2][t]));

    store_intrin(&trg_value[0][t], tvx);
    store_intrin(&trg_value[1][t], tvy);
    store_intrin(&trg_value[2][t], tvz);
  }
}
// ...

// Double-layer velocity accumulation:
//   u(t) += (3/(4 pi)) (r.n)(r.f) r / |r|^5,  r = t - s,
// with contributions masked to zero when |r|^2 <= eps.
static ValueType eps = machine_eps() * 128;
size_t VecLen = sizeof(Vec) / sizeof(ValueType);

// RSQRT_INTRIN returns 1/|r| up to the scale factor nwtn_scal accumulated by
// its Newton refinement steps (s <- 2*s^3 per iteration, starting at 1); the
// constant below divides that scale back out.
size_t NWTN_ITER = 0;
if (RSQRT_INTRIN == (Vec (*)(Vec))rsqrt_intrin0<Vec, ValueType>) NWTN_ITER = 0;
if (RSQRT_INTRIN == (Vec (*)(Vec))rsqrt_intrin1<Vec, ValueType>) NWTN_ITER = 1;
if (RSQRT_INTRIN == (Vec (*)(Vec))rsqrt_intrin2<Vec, ValueType>) NWTN_ITER = 2;
if (RSQRT_INTRIN == (Vec (*)(Vec))rsqrt_intrin3<Vec, ValueType>) NWTN_ITER = 3;

ValueType nwtn_scal = 1;
for (int i = 0; i < NWTN_ITER; i++) {
  nwtn_scal = 2 * nwtn_scal * nwtn_scal * nwtn_scal;
}
const ValueType SCAL_CONST = 3.0 / (4.0 * nwtn_scal * nwtn_scal * nwtn_scal * nwtn_scal * nwtn_scal * const_pi<ValueType>());

size_t src_cnt_ = src_coord.Dim(1);
size_t trg_cnt_ = trg_coord.Dim(1);
for (size_t sblk = 0; sblk < src_cnt_; sblk += SRC_BLK) {
  size_t src_cnt = src_cnt_ - sblk;
  if (src_cnt > SRC_BLK) src_cnt = SRC_BLK;
  for (size_t t = 0; t < trg_cnt_; t += VecLen) {
    Vec tx = load_intrin<Vec>(&trg_coord[0][t]);
    Vec ty = load_intrin<Vec>(&trg_coord[1][t]);
    Vec tz = load_intrin<Vec>(&trg_coord[2][t]);

    Vec tvx = zero_intrin<Vec>();
    Vec tvy = zero_intrin<Vec>();
    Vec tvz = zero_intrin<Vec>();
    for (size_t s = sblk; s < sblk + src_cnt; s++) {
      Vec dx = sub_intrin(tx, bcast_intrin<Vec>(&src_coord[0][s]));
      Vec dy = sub_intrin(ty, bcast_intrin<Vec>(&src_coord[1][s]));
      Vec dz = sub_intrin(tz, bcast_intrin<Vec>(&src_coord[2][s]));

      Vec snx = bcast_intrin<Vec>(&src_value[0][s]);
      Vec sny = bcast_intrin<Vec>(&src_value[1][s]);
      Vec snz = bcast_intrin<Vec>(&src_value[2][s]);

      Vec svx = bcast_intrin<Vec>(&src_norml[0][s]);
      Vec svy = bcast_intrin<Vec>(&src_norml[1][s]);
      Vec svz = bcast_intrin<Vec>(&src_norml[2][s]);

      Vec r2 = mul_intrin(dx, dx);
      r2 = add_intrin(r2, mul_intrin(dy, dy));
      r2 = add_intrin(r2, mul_intrin(dz, dz));
      r2 = and_intrin(cmplt_intrin(set_intrin<Vec, ValueType>(eps), r2), r2);  // zero r2 when r2 <= eps

      Vec rinv = RSQRT_INTRIN(r2);
      Vec rinv2 = mul_intrin(rinv, rinv);
      Vec rinv5 = mul_intrin(mul_intrin(rinv2, rinv2), rinv);

      Vec r_dot_n = mul_intrin(snx, dx);
      r_dot_n = add_intrin(r_dot_n, mul_intrin(sny, dy));
      r_dot_n = add_intrin(r_dot_n, mul_intrin(snz, dz));

      Vec r_dot_f = mul_intrin(svx, dx);
      r_dot_f = add_intrin(r_dot_f, mul_intrin(svy, dy));
      r_dot_f = add_intrin(r_dot_f, mul_intrin(svz, dz));

      Vec p = mul_intrin(mul_intrin(r_dot_n, r_dot_f), rinv5);
      tvx = add_intrin(tvx, mul_intrin(dx, p));
      tvy = add_intrin(tvy, mul_intrin(dy, p));
      tvz = add_intrin(tvz, mul_intrin(dz, p));
    }
    Vec scal_const = set_intrin<Vec, ValueType>(SCAL_CONST);

    tvx = add_intrin(mul_intrin(tvx, scal_const), load_intrin<Vec>(&trg_value[0][t]));
    tvy = add_intrin(mul_intrin(tvy, scal_const), load_intrin<Vec>(&trg_value[1][t]));
    tvz = add_intrin(mul_intrin(tvz, scal_const), load_intrin<Vec>(&trg_value[2][t]));

    store_intrin(&trg_value[0][t], tvx);
    store_intrin(&trg_value[1][t], tvy);
    store_intrin(&trg_value[2][t], tvz);
  }
}
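}

// Scalar sketch of the double-layer sum above (an illustrative helper, not
// part of the library API): the contribution of each source is
// (3/(4 pi)) (r.n)(r.f) r / |r|^5, which is symmetric in n and f.
template <class T> static void dl_vel_reference(
    const T* src, const T* nor, const T* den, long ns,
    const T* trg, T* pot, long nt) {
  const T SCAL = 3 / (4 * const_pi<T>());
  for (long t = 0; t < nt; t++) {
    T u[3] = {0, 0, 0};
    for (long s = 0; s < ns; s++) {
      T r[3], r2 = 0;
      for (int k = 0; k < 3; k++) { r[k] = trg[t*3+k] - src[s*3+k]; r2 += r[k]*r[k]; }
      if (r2 == 0) continue;  // self-interaction excluded, like the eps mask
      T rinv = 1 / pvfmm::sqrt<T>(r2);
      T rinv5 = rinv * rinv * rinv * rinv * rinv;
      T rn = r[0]*nor[s*3+0] + r[1]*nor[s*3+1] + r[2]*nor[s*3+2];
      T rf = r[0]*den[s*3+0] + r[1]*den[s*3+1] + r[2]*den[s*3+2];
      T p = rn * rf * rinv5;
      for (int k = 0; k < 3; k++) u[k] += r[k] * p;
    }
    for (int k = 0; k < 3; k++) pot[t*3+k] += SCAL * u[k];
  }
}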
// ...

// Single-layer pressure accumulation:
//   p(t) += (1/(4 pi)) (f.r) / |r|^3,  r = t - s,
// with contributions masked to zero when |r|^2 <= eps.
static ValueType eps = machine_eps() * 128;
size_t VecLen = sizeof(Vec) / sizeof(ValueType);

size_t NWTN_ITER = 0;
if (RSQRT_INTRIN == (Vec (*)(Vec))rsqrt_intrin0<Vec, ValueType>) NWTN_ITER = 0;
if (RSQRT_INTRIN == (Vec (*)(Vec))rsqrt_intrin1<Vec, ValueType>) NWTN_ITER = 1;
if (RSQRT_INTRIN == (Vec (*)(Vec))rsqrt_intrin2<Vec, ValueType>) NWTN_ITER = 2;
if (RSQRT_INTRIN == (Vec (*)(Vec))rsqrt_intrin3<Vec, ValueType>) NWTN_ITER = 3;

ValueType nwtn_scal = 1;
for (int i = 0; i < NWTN_ITER; i++) {
  nwtn_scal = 2 * nwtn_scal * nwtn_scal * nwtn_scal;
}
const ValueType OOEP = 1.0 / (4 * nwtn_scal * nwtn_scal * nwtn_scal * const_pi<ValueType>());

size_t src_cnt_ = src_coord.Dim(1);
size_t trg_cnt_ = trg_coord.Dim(1);
for (size_t sblk = 0; sblk < src_cnt_; sblk += SRC_BLK) {
  size_t src_cnt = src_cnt_ - sblk;
  if (src_cnt > SRC_BLK) src_cnt = SRC_BLK;
  for (size_t t = 0; t < trg_cnt_; t += VecLen) {
    Vec tx = load_intrin<Vec>(&trg_coord[0][t]);
    Vec ty = load_intrin<Vec>(&trg_coord[1][t]);
    Vec tz = load_intrin<Vec>(&trg_coord[2][t]);

    Vec tv = zero_intrin<Vec>();
    for (size_t s = sblk; s < sblk + src_cnt; s++) {
      Vec dx = sub_intrin(tx, bcast_intrin<Vec>(&src_coord[0][s]));
      Vec dy = sub_intrin(ty, bcast_intrin<Vec>(&src_coord[1][s]));
      Vec dz = sub_intrin(tz, bcast_intrin<Vec>(&src_coord[2][s]));

      Vec svx = bcast_intrin<Vec>(&src_value[0][s]);
      Vec svy = bcast_intrin<Vec>(&src_value[1][s]);
      Vec svz = bcast_intrin<Vec>(&src_value[2][s]);

      Vec r2 = mul_intrin(dx, dx);
      r2 = add_intrin(r2, mul_intrin(dy, dy));
      r2 = add_intrin(r2, mul_intrin(dz, dz));
      r2 = and_intrin(cmplt_intrin(set_intrin<Vec, ValueType>(eps), r2), r2);  // zero r2 when r2 <= eps

      Vec rinv = RSQRT_INTRIN(r2);
      Vec rinv3 = mul_intrin(mul_intrin(rinv, rinv), rinv);

      Vec inner_prod = mul_intrin(svx, dx);
      inner_prod = add_intrin(inner_prod, mul_intrin(svy, dy));
      inner_prod = add_intrin(inner_prod, mul_intrin(svz, dz));
      tv = add_intrin(tv, mul_intrin(inner_prod, rinv3));
    }
    Vec ooep = set_intrin<Vec, ValueType>(OOEP);

    tv = add_intrin(mul_intrin(tv, ooep), load_intrin<Vec>(&trg_value[0][t]));
    store_intrin(&trg_value[0][t], tv);
  }
}
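}

// Scalar sketch of the pressure sum above (illustrative helper, not part of
// the library API): p(t) += (1/(4 pi)) (f.r) / |r|^3.
template <class T> static void sl_press_reference(
    const T* src, const T* den, long ns, const T* trg, T* pot, long nt) {
  const T OOFP = 1 / (4 * const_pi<T>());
  for (long t = 0; t < nt; t++) {
    T p = 0;
    for (long s = 0; s < ns; s++) {
      T r[3], r2 = 0;
      for (int k = 0; k < 3; k++) { r[k] = trg[t*3+k] - src[s*3+k]; r2 += r[k]*r[k]; }
      if (r2 == 0) continue;  // skip self-interaction
      T rinv = 1 / pvfmm::sqrt<T>(r2);
      T rf = r[0]*den[s*3+0] + r[1]*den[s*3+1] + r[2]*den[s*3+2];
      p += rf * rinv * rinv * rinv;
    }
    pot[t] += OOFP * p;
  }
}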
// ...

// Pressure accumulation: p(t) += (1/(4 pi)) (f.r) / |r|^3 (same sum as above).
size_t VecLen = sizeof(Vec) / sizeof(ValueType);

size_t NWTN_ITER = 0;
if (RSQRT_INTRIN == (Vec (*)(Vec))rsqrt_intrin0<Vec, ValueType>) NWTN_ITER = 0;
if (RSQRT_INTRIN == (Vec (*)(Vec))rsqrt_intrin1<Vec, ValueType>) NWTN_ITER = 1;
if (RSQRT_INTRIN == (Vec (*)(Vec))rsqrt_intrin2<Vec, ValueType>) NWTN_ITER = 2;
if (RSQRT_INTRIN == (Vec (*)(Vec))rsqrt_intrin3<Vec, ValueType>) NWTN_ITER = 3;

ValueType nwtn_scal = 1;
for (int i = 0; i < NWTN_ITER; i++) {
  nwtn_scal = 2 * nwtn_scal * nwtn_scal * nwtn_scal;
}
const ValueType OOEP = 1.0 / (4 * nwtn_scal * nwtn_scal * nwtn_scal * const_pi<ValueType>());

size_t src_cnt_ = src_coord.Dim(1);
size_t trg_cnt_ = trg_coord.Dim(1);
for (size_t sblk = 0; sblk < src_cnt_; sblk += SRC_BLK) {
  size_t src_cnt = src_cnt_ - sblk;
  if (src_cnt > SRC_BLK) src_cnt = SRC_BLK;
  for (size_t t = 0; t < trg_cnt_; t += VecLen) {
    Vec tx = load_intrin<Vec>(&trg_coord[0][t]);
    Vec ty = load_intrin<Vec>(&trg_coord[1][t]);
    Vec tz = load_intrin<Vec>(&trg_coord[2][t]);

    Vec tv = zero_intrin<Vec>();
    for (size_t s = sblk; s < sblk + src_cnt; s++) {
      Vec dx = sub_intrin(tx, bcast_intrin<Vec>(&src_coord[0][s]));
      Vec dy = sub_intrin(ty, bcast_intrin<Vec>(&src_coord[1][s]));
      Vec dz = sub_intrin(tz, bcast_intrin<Vec>(&src_coord[2][s]));

      Vec svx = bcast_intrin<Vec>(&src_value[0][s]);
      Vec svy = bcast_intrin<Vec>(&src_value[1][s]);
      Vec svz = bcast_intrin<Vec>(&src_value[2][s]);

      Vec r2 = mul_intrin(dx, dx);
      r2 = add_intrin(r2, mul_intrin(dy, dy));
      r2 = add_intrin(r2, mul_intrin(dz, dz));

      Vec rinv = RSQRT_INTRIN(r2);
      Vec rinv3 = mul_intrin(mul_intrin(rinv, rinv), rinv);

      Vec inner_prod = mul_intrin(svx, dx);
      inner_prod = add_intrin(inner_prod, mul_intrin(svy, dy));
      inner_prod = add_intrin(inner_prod, mul_intrin(svz, dz));
      tv = add_intrin(tv, mul_intrin(inner_prod, rinv3));
    }
    Vec ooep = set_intrin<Vec, ValueType>(OOEP);

    tv = add_intrin(mul_intrin(tv, ooep), load_intrin<Vec>(&trg_value[0][t]));
    store_intrin(&trg_value[0][t], tv);
  }
}
}
static ValueType machine_eps() {
  ValueType eps = 1;
  while (eps * (ValueType)0.5 + (ValueType)1.0 > 1.0) eps *= 0.5;
  return eps;  // ~1.19e-07 for float, ~2.22e-16 for double
}
// Dispatch: instantiate the uKernel for the requested number of Newton
// iterations (newton_iter = 0..3) and for the SIMD vector type selected by
// the target ISA, with float, double, and generic Real branches.
#define PVFMM_KER_NWTN(nwtn) \
  if (newton_iter == nwtn) KernelFnWrapper::kernel_wrapper<ValueType, Real, Vec, DIM, 3, 3, Stokes3D<Real>::template sl_vel_uKernel<Vec, rsqrt_intrin##nwtn<Vec, Real>>>(r_src, n_src, v_src, r_trg, v_trg)
#define PVFMM_KERNEL_MACRO \
  PVFMM_KER_NWTN(0); \
  PVFMM_KER_NWTN(1); \
  PVFMM_KER_NWTN(2); \
  PVFMM_KER_NWTN(3)
// ...
#elif defined __AVX__
// ...
#elif defined __SSE3__
// ...
typedef double Real;
// ...
#elif defined __AVX__
// ...
#elif defined __SSE3__
// ...
typedef ValueType Real;
// ...
#undef PVFMM_KER_NWTN
#undef PVFMM_KERNEL_MACRO
// ...
#define PVFMM_KER_NWTN(nwtn) \
  if (newton_iter == nwtn) KernelFnWrapper::kernel_wrapper<ValueType, Real, Vec, DIM, 4, 3, Stokes3D<Real>::template sl_vel_m2x_uKernel<Vec, rsqrt_intrin##nwtn<Vec, Real>>>(r_src, n_src, v_src, r_trg, v_trg)
#define PVFMM_KERNEL_MACRO \
  PVFMM_KER_NWTN(0); \
  PVFMM_KER_NWTN(1); \
  PVFMM_KER_NWTN(2); \
  PVFMM_KER_NWTN(3)
// ...
#elif defined __AVX__
// ...
#elif defined __SSE3__
// ...
typedef double Real;
// ...
#elif defined __AVX__
// ...
#elif defined __SSE3__
// ...
typedef ValueType Real;
// ...
#undef PVFMM_KER_NWTN
#undef PVFMM_KERNEL_MACRO
// ...
#define PVFMM_KER_NWTN(nwtn) \
  if (newton_iter == nwtn) KernelFnWrapper::kernel_wrapper<ValueType, Real, Vec, DIM, 3, 3, Stokes3D<Real>::template dl_vel_uKernel<Vec, rsqrt_intrin##nwtn<Vec, Real>>>(r_src, n_src, v_src, r_trg, v_trg)
#define PVFMM_KERNEL_MACRO \
  PVFMM_KER_NWTN(0); \
  PVFMM_KER_NWTN(1); \
  PVFMM_KER_NWTN(2); \
  PVFMM_KER_NWTN(3)
// ...
#elif defined __AVX__
// ...
#elif defined __SSE3__
// ...
typedef double Real;
// ...
#elif defined __AVX__
// ...
#elif defined __SSE3__
// ...
typedef ValueType Real;
// ...
#undef PVFMM_KER_NWTN
#undef PVFMM_KERNEL_MACRO
// ...
#define PVFMM_KER_NWTN(nwtn) \
  if (newton_iter == nwtn) KernelFnWrapper::kernel_wrapper<ValueType, Real, Vec, DIM, 3, 1, Stokes3D<Real>::template sl_press_uKernel<Vec, rsqrt_intrin##nwtn<Vec, Real>>>(r_src, n_src, v_src, r_trg, v_trg)
#define PVFMM_KERNEL_MACRO \
  PVFMM_KER_NWTN(0); \
  PVFMM_KER_NWTN(1); \
  PVFMM_KER_NWTN(2); \
  PVFMM_KER_NWTN(3)
// ...
#elif defined __AVX__
// ...
#elif defined __SSE3__
// ...
typedef double Real;
// ...
#elif defined __AVX__
// ...
#elif defined __SSE3__
// ...
typedef ValueType Real;
// ...
#undef PVFMM_KER_NWTN
#undef PVFMM_KERNEL_MACRO
// ...
#define PVFMM_KER_NWTN(nwtn) \
  if (newton_iter == nwtn) KernelFnWrapper::kernel_wrapper<ValueType, Real, Vec, DIM, 4, 1, Stokes3D<Real>::template sl_press_m2x_uKernel<Vec, rsqrt_intrin##nwtn<Vec, Real>>>(r_src, n_src, v_src, r_trg, v_trg)
#define PVFMM_KERNEL_MACRO \
  PVFMM_KER_NWTN(0); \
  PVFMM_KER_NWTN(1); \
  PVFMM_KER_NWTN(2); \
  PVFMM_KER_NWTN(3)
// ...
#elif defined __AVX__
// ...
#elif defined __SSE3__
// ...
typedef double Real;
// ...
#elif defined __AVX__
// ...
#elif defined __SSE3__
// ...
typedef ValueType Real;
// ...
#undef PVFMM_KER_NWTN
#undef PVFMM_KERNEL_MACRO

// ...

static const Integer DIM = 3;
// ...
constexpr Integer k_dim0 = DIM;
constexpr Integer k_dim1 = DIM;
static const ValueType sigma2 = 0.0001;
static const ValueType scal = pvfmm::pow(2.0 * const_pi<ValueType>() * sigma2, -(DIM - 1) * 0.5);
// Gaussian micro-kernel: v += exp(-r^2 / (2 sigma2)) * f * scal, with r
// recovered from invr = 1/r (invr == 0 marks a coincident point; then
// r2 = inf and the weight underflows to zero).
static auto micro_ker = [&](ValueType* v, const ValueType* x, ValueType invr, const ValueType* n, const ValueType* f) {
  ValueType r2 = 1.0 / (invr * invr);
  ValueType g = exp(-r2 / sigma2 * 0.5) * scal;  // evaluate the weight once
  v[0] += g * f[0];
  v[1] += g * f[1];
  v[2] += g * f[2];
};
return KernelFromLambda<k_dim0, k_dim1>(__FUNCTION__, micro_ker);
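// Usage sketch: how the factory above is exposed depends on the surrounding
// class, so `gauss_ker` and the accessor name are illustrative only. The
// returned KernelFunction is invoked like the built-in kernels,
//
//   const auto& gauss_ker = GaussianKernel();  // hypothetical accessor
//   gauss_ker(r_src, n_src, v_src, r_trg, v_trg);
//
// where v_src carries k_dim0 = 3 density components per source point and
// v_trg accumulates k_dim1 = 3 output components per target point.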
// ...
// Trampoline: recover the std::function stored in ctx and forward the call.
const auto& ker = *(std::function<typename KernelFunction<ValueType, DIM>::KerFn>*)(ctx);
ker(r_src, n_src, v_src, r_trg, v_trg, NULL);
template <Integer k_dim0, Integer k_dim1, class LambdaType>
inline static const KernelFunction<ValueType, DIM>& KernelFromLambda(std::string name, LambdaType&& micro_ker) {
  // ...
  // Direct O(src_cnt * trg_cnt) evaluation: for each target, accumulate
  // micro_ker over all sources, then add the result into v_trg.
  Long src_cnt = r_src.Dim() / DIM;
  Long trg_cnt = r_trg.Dim() / DIM;
  for (Long t = 0; t < trg_cnt; t++) {
    ValueType v[k_dim1];
    for (Integer i = 0; i < k_dim1; i++) {
      v[i] = 0;
    }
    for (Long s = 0; s < src_cnt; s++) {
      ValueType dx[DIM], f[k_dim0], r2 = 0, invr = 0;
      for (Integer k = 0; k < DIM; k++) {
        dx[k] = r_trg[t * DIM + k] - r_src[s * DIM + k];
        r2 += dx[k] * dx[k];
      }
      for (Integer k = 0; k < k_dim0; k++) {
        f[k] = v_src[s * k_dim0 + k];
      }
      if (r2 > 0) invr = 1.0 / sqrt(r2);
      micro_ker(v, dx, invr, &n_src[s * DIM], f);
    }
    for (Integer i = 0; i < k_dim1; i++) {
      v_trg[t * k_dim1 + i] += v[i];
    }
  }
  // ...
}
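// The micro-kernel contract, illustrated with a hypothetical Laplace
// single-layer lambda (not part of the library): micro_ker receives the
// k_dim1-entry accumulator v, the displacement x = trg - src, invr = 1/|x|
// (0 for coincident points), the source normal n, and the k_dim0-entry
// density f.
//
//   auto laplace_sl = [](ValueType* v, const ValueType* x, ValueType invr,
//                        const ValueType* n, const ValueType* f) {
//     v[0] += f[0] * invr / (4 * const_pi<ValueType>());  // G(r) = 1/(4 pi r)
//   };
//   // return KernelFromLambda<1, 1>("laplace_sl", laplace_sl);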
#endif //_PVFMM_KERNEL_HPP_