#include <hmlp_internal.hpp>

struct gsks_gaussian_int_s8x4
{
  inline GSKS_OPERATOR(float) const
  {
    printf( "not implemented yet\n" );
    exit( 1 );
  };
};
struct gsks_gaussian_int_d8x4
{
  const size_t pack_mr    =  8;
  const size_t pack_nr    =  4;
  const size_t align_size = 32;
  const bool   row_major  = false;

  inline GSKS_OPERATOR(double) const
  {
    double neg2  = -2.0;   /* assumed: feeds the -2 a^T b scaling below */
    double dzero =  0.0;   /* assumed: threshold for the zero clamp below */
    double alpha = ker->scal;

    v4df_t c03_0, c03_1, c03_2, c03_3;
    v4df_t c47_0, c47_1, c47_2, c47_3;
    v4df_t tmpc03_0, tmpc03_1, tmpc03_2, tmpc03_3;
    v4df_t tmpc47_0, tmpc47_1, tmpc47_2, tmpc47_3;
    v4df_t u03, u47;
    v4df_t c_tmp, aa_tmp, bb_tmp, w_tmp;
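
    /* Rank-k update: accumulate the 8x4 panel of inner products
       a_i^T b_j into the c##_# registers. */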
64 #include "component/rank_k_int_d8x4.hpp" 66 __asm__
volatile(
"prefetcht0 0(%0) \n\t" : :
"r"( aa ) );
67 __asm__
volatile(
"prefetcht0 0(%0) \n\t" : :
"r"( bb ) );
    tmpc03_0.v = _mm256_load_pd( (double*)( c               ) );
    tmpc47_0.v = _mm256_load_pd( (double*)( c           + 4 ) );

    tmpc03_1.v = _mm256_load_pd( (double*)( c + 1 * ldc     ) );
    tmpc47_1.v = _mm256_load_pd( (double*)( c + 1 * ldc + 4 ) );

    tmpc03_2.v = _mm256_load_pd( (double*)( c + 2 * ldc     ) );
    tmpc47_2.v = _mm256_load_pd( (double*)( c + 2 * ldc + 4 ) );

    tmpc03_3.v = _mm256_load_pd( (double*)( c + 3 * ldc     ) );
    tmpc47_3.v = _mm256_load_pd( (double*)( c + 3 * ldc + 4 ) );

    c03_0.v = _mm256_add_pd( tmpc03_0.v, c03_0.v );
    c47_0.v = _mm256_add_pd( tmpc47_0.v, c47_0.v );
    c03_1.v = _mm256_add_pd( tmpc03_1.v, c03_1.v );
    c47_1.v = _mm256_add_pd( tmpc47_1.v, c47_1.v );
    c03_2.v = _mm256_add_pd( tmpc03_2.v, c03_2.v );
    c47_2.v = _mm256_add_pd( tmpc47_2.v, c47_2.v );
    c03_3.v = _mm256_add_pd( tmpc03_3.v, c03_3.v );
    c47_3.v = _mm256_add_pd( tmpc47_3.v, c47_3.v );
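
    /* Start forming squared distances: scale the inner products by -2,
       the middle term of ||a_i - b_j||^2 = ||a_i||^2 - 2 a_i^T b_j + ||b_j||^2. */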
    aa_tmp.v = _mm256_broadcast_sd( &neg2 );
    c03_0.v = _mm256_mul_pd( aa_tmp.v, c03_0.v );
    c03_1.v = _mm256_mul_pd( aa_tmp.v, c03_1.v );
    c03_2.v = _mm256_mul_pd( aa_tmp.v, c03_2.v );
    c03_3.v = _mm256_mul_pd( aa_tmp.v, c03_3.v );
    c47_0.v = _mm256_mul_pd( aa_tmp.v, c47_0.v );
    c47_1.v = _mm256_mul_pd( aa_tmp.v, c47_1.v );
    c47_2.v = _mm256_mul_pd( aa_tmp.v, c47_2.v );
    c47_3.v = _mm256_mul_pd( aa_tmp.v, c47_3.v );
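
    /* Add the packed a-side squared norms ||a_i||^2 (aa[0:7]) to every column. */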
    aa_tmp.v = _mm256_load_pd( (double*)aa );
    c03_0.v = _mm256_add_pd( aa_tmp.v, c03_0.v );
    c03_1.v = _mm256_add_pd( aa_tmp.v, c03_1.v );
    c03_2.v = _mm256_add_pd( aa_tmp.v, c03_2.v );
    c03_3.v = _mm256_add_pd( aa_tmp.v, c03_3.v );

    aa_tmp.v = _mm256_load_pd( (double*)( aa + 4 ) );
    c47_0.v = _mm256_add_pd( aa_tmp.v, c47_0.v );
    c47_1.v = _mm256_add_pd( aa_tmp.v, c47_1.v );
    c47_2.v = _mm256_add_pd( aa_tmp.v, c47_2.v );
    c47_3.v = _mm256_add_pd( aa_tmp.v, c47_3.v );
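
    /* Prefetch the output panel u, then add the b-side squared norms
       ||b_j||^2 (bb[0:3]), one per column; each c register now holds
       squared pairwise distances. */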
    __asm__ volatile( "prefetcht0 0(%0)    \n\t" : : "r"( u ) );

    bb_tmp.v = _mm256_broadcast_sd( (double*)bb );
    c03_0.v = _mm256_add_pd( bb_tmp.v, c03_0.v );
    c47_0.v = _mm256_add_pd( bb_tmp.v, c47_0.v );

    bb_tmp.v = _mm256_broadcast_sd( (double*)( bb + 1 ) );
    c03_1.v = _mm256_add_pd( bb_tmp.v, c03_1.v );
    c47_1.v = _mm256_add_pd( bb_tmp.v, c47_1.v );

    bb_tmp.v = _mm256_broadcast_sd( (double*)( bb + 2 ) );
    c03_2.v = _mm256_add_pd( bb_tmp.v, c03_2.v );
    c47_2.v = _mm256_add_pd( bb_tmp.v, c47_2.v );

    bb_tmp.v = _mm256_broadcast_sd( (double*)( bb + 3 ) );
    c03_3.v = _mm256_add_pd( bb_tmp.v, c03_3.v );
    c47_3.v = _mm256_add_pd( bb_tmp.v, c47_3.v );
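
    /* Squared distances can come out slightly negative through
       floating-point cancellation; clamp them at zero. */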
    c_tmp.v = _mm256_broadcast_sd( &dzero );
    c03_0.v = _mm256_max_pd( c_tmp.v, c03_0.v );
    c03_1.v = _mm256_max_pd( c_tmp.v, c03_1.v );
    c03_2.v = _mm256_max_pd( c_tmp.v, c03_2.v );
    c03_3.v = _mm256_max_pd( c_tmp.v, c03_3.v );
    c47_0.v = _mm256_max_pd( c_tmp.v, c47_0.v );
    c47_1.v = _mm256_max_pd( c_tmp.v, c47_1.v );
    c47_2.v = _mm256_max_pd( c_tmp.v, c47_2.v );
    c47_3.v = _mm256_max_pd( c_tmp.v, c47_3.v );
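
    /* Scale by alpha = ker->scal, the coefficient of the Gaussian
       exponent, before exponentiating. */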
    aa_tmp.v = _mm256_broadcast_sd( &alpha );
    c03_0.v = _mm256_mul_pd( aa_tmp.v, c03_0.v );
    c03_1.v = _mm256_mul_pd( aa_tmp.v, c03_1.v );
    c03_2.v = _mm256_mul_pd( aa_tmp.v, c03_2.v );
    c03_3.v = _mm256_mul_pd( aa_tmp.v, c03_3.v );
    c47_0.v = _mm256_mul_pd( aa_tmp.v, c47_0.v );
    c47_1.v = _mm256_mul_pd( aa_tmp.v, c47_1.v );
    c47_2.v = _mm256_mul_pd( aa_tmp.v, c47_2.v );
    c47_3.v = _mm256_mul_pd( aa_tmp.v, c47_3.v );
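
    /* Load the 8 running output sums u[0:7]. */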
    u03.v = _mm256_load_pd( (double*)u );
    u47.v = _mm256_load_pd( (double*)( u + 4 ) );
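
    /* Prefetch the next panel of u and the weights w for the weighted sum. */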
    __asm__ volatile( "prefetcht0 0(%0)    \n\t" : : "r"( u + 8 ) );
    __asm__ volatile( "prefetcht0 0(%0)    \n\t" : : "r"( w ) );
    #include "component/exp_int_d8x4.hpp"

    #include "component/weighted_sum_int_d8x4.hpp"
  };
};