/**
 *  Call operator of a 24x8 double-precision AVX-512 micro-kernel.
 *
 *  Steps visible in this excerpt, in order:
 *    1. read the scalar `alpha` from `ker->scal`;
 *    2. declare 24 eight-lane accumulators (c07_*, c15_*, c23_*);
 *    3. textually include the rank-k and sq2nrm segments;
 *    4. multiply every accumulator by `alpha`;
 *    5. prefetch `u` and `w`;
 *    6. exponentiate every accumulator;
 *    7. load 24 doubles from `u` and include the weighted-sum segment.
 *
 *  NOTE(review): this listing is partial. Only the parameters `a`, `aa`,
 *  `b` and `bb` are visible; `ker`, `u`, `w`, `a07`, `a15` and `a23` are
 *  used below but declared in lines not shown here. The closing brace of
 *  the function is also outside this excerpt.
 *
 *  NOTE(review): exp( alpha * value ) matches a Gaussian kernel
 *  evaluation, presumably with `alpha` a negative bandwidth factor --
 *  confirm against the included segments and the caller.
 */
7 #include <hmlp_internal.hpp> 68 inline void operator()(
73 double *a,
double *aa,
74 double *b,
double *bb,
// Scaling factor applied to every accumulator before the exponential.
80 double alpha = ker->scal;
// 24 accumulators of 8 doubles each, in three groups of eight.
// NOTE(review): the names suggest rows 0-7 / 8-15 / 16-23 and columns 0-7
// of a 24x8 tile -- confirm against rank_k_int_d24x8.segment.
83 v8df_t c07_0, c07_1, c07_2, c07_3, c07_4, c07_5, c07_6, c07_7;
84 v8df_t c15_0, c15_1, c15_2, c15_3, c15_4, c15_5, c15_6, c15_7;
85 v8df_t c23_0, c23_1, c23_2, c23_3, c23_4, c23_5, c23_6, c23_7;
// The two .segment files are pasted in textually and are not shown here;
// presumably they fill the accumulators (rank-k update, then squared
// 2-norm distances) -- TODO confirm.
// After them, a07 is used as scratch: alpha is broadcast to all 8 lanes.
99 #include <rank_k_int_d24x8.segment> 114 #include <sq2nrm_int_d24x8.segment> 128 a07.v = _mm512_set1_pd( alpha );
// Scale accumulator group c07 by alpha.
129 c07_0.v = _mm512_mul_pd( a07.v, c07_0.v );
130 c07_1.v = _mm512_mul_pd( a07.v, c07_1.v );
131 c07_2.v = _mm512_mul_pd( a07.v, c07_2.v );
132 c07_3.v = _mm512_mul_pd( a07.v, c07_3.v );
133 c07_4.v = _mm512_mul_pd( a07.v, c07_4.v );
134 c07_5.v = _mm512_mul_pd( a07.v, c07_5.v );
135 c07_6.v = _mm512_mul_pd( a07.v, c07_6.v );
136 c07_7.v = _mm512_mul_pd( a07.v, c07_7.v );
// Scale accumulator group c15 by alpha.
138 c15_0.v = _mm512_mul_pd( a07.v, c15_0.v );
139 c15_1.v = _mm512_mul_pd( a07.v, c15_1.v );
140 c15_2.v = _mm512_mul_pd( a07.v, c15_2.v );
141 c15_3.v = _mm512_mul_pd( a07.v, c15_3.v );
142 c15_4.v = _mm512_mul_pd( a07.v, c15_4.v );
143 c15_5.v = _mm512_mul_pd( a07.v, c15_5.v );
144 c15_6.v = _mm512_mul_pd( a07.v, c15_6.v );
145 c15_7.v = _mm512_mul_pd( a07.v, c15_7.v );
// Scale accumulator group c23 by alpha.
147 c23_0.v = _mm512_mul_pd( a07.v, c23_0.v );
148 c23_1.v = _mm512_mul_pd( a07.v, c23_1.v );
149 c23_2.v = _mm512_mul_pd( a07.v, c23_2.v );
150 c23_3.v = _mm512_mul_pd( a07.v, c23_3.v );
151 c23_4.v = _mm512_mul_pd( a07.v, c23_4.v );
152 c23_5.v = _mm512_mul_pd( a07.v, c23_5.v );
153 c23_6.v = _mm512_mul_pd( a07.v, c23_6.v );
154 c23_7.v = _mm512_mul_pd( a07.v, c23_7.v );
// Prefetch u at byte offsets 0, 64 and 128. These are the three 64-byte
// blocks read by the loads of u, u + 8 and u + 16 further down, so the
// fetch overlaps with the exponentials in between.
171 __asm__
volatile(
"prefetcht0 0(%0) \n\t" : :
"r"( u ) );
172 __asm__
volatile(
"prefetcht0 64(%0) \n\t" : :
"r"( u ) );
173 __asm__
volatile(
"prefetcht0 128(%0) \n\t" : :
"r"( u ) );
// Prefetch w at byte offset 0 only.
// NOTE(review): w is presumably consumed by the weighted-sum segment.
174 __asm__
volatile(
"prefetcht0 0(%0) \n\t" : :
"r"( w ) );
// Element-wise exponential of every accumulator.
// NOTE(review): this excerpt shows both the exp_int_d24x8.h include and
// the explicit _mm512_exp_pd calls. They are probably alternatives chosen
// by a preprocessor conditional that the listing dropped. If both were
// active the accumulators would be exponentiated twice -- confirm against
// the real file.
179 #include "exp_int_d24x8.h" 183 c07_0.v = _mm512_exp_pd( c07_0.v );
184 c07_1.v = _mm512_exp_pd( c07_1.v );
185 c07_2.v = _mm512_exp_pd( c07_2.v );
186 c07_3.v = _mm512_exp_pd( c07_3.v );
187 c07_4.v = _mm512_exp_pd( c07_4.v );
188 c07_5.v = _mm512_exp_pd( c07_5.v );
189 c07_6.v = _mm512_exp_pd( c07_6.v );
190 c07_7.v = _mm512_exp_pd( c07_7.v );
192 c15_0.v = _mm512_exp_pd( c15_0.v );
193 c15_1.v = _mm512_exp_pd( c15_1.v );
194 c15_2.v = _mm512_exp_pd( c15_2.v );
195 c15_3.v = _mm512_exp_pd( c15_3.v );
196 c15_4.v = _mm512_exp_pd( c15_4.v );
197 c15_5.v = _mm512_exp_pd( c15_5.v );
198 c15_6.v = _mm512_exp_pd( c15_6.v );
199 c15_7.v = _mm512_exp_pd( c15_7.v );
201 c23_0.v = _mm512_exp_pd( c23_0.v );
202 c23_1.v = _mm512_exp_pd( c23_1.v );
203 c23_2.v = _mm512_exp_pd( c23_2.v );
204 c23_3.v = _mm512_exp_pd( c23_3.v );
205 c23_4.v = _mm512_exp_pd( c23_4.v );
206 c23_5.v = _mm512_exp_pd( c23_5.v );
207 c23_6.v = _mm512_exp_pd( c23_6.v );
208 c23_7.v = _mm512_exp_pd( c23_7.v );
// Load u[0..23] into three vectors. a07 no longer holds alpha after this.
// _mm512_load_pd is the aligned load, so u must be 64-byte aligned.
224 a07.v = _mm512_load_pd( u );
225 a15.v = _mm512_load_pd( u + 8 );
226 a23.v = _mm512_load_pd( u + 16 );
// Textually included segment, not shown in this excerpt.
// NOTE(review): presumably combines the exponentiated accumulators with w
// and adds the result to a07/a15/a23 (u) -- TODO confirm.
229 #include<weighted_sum_int_d24x8.segment> Definition: gaussian_d24x8.hpp:66
Definition: hmlp_internal.hpp:38