/*
 * Inner loop of an AVX rank-4 update over panels of 8 doubles of u.
 * Each iteration is 2-way unrolled and software-pipelined:
 *   half 1: accumulate w[0..3] * c{03,47}_{0..3} into u03/u47 (the
 *           u[0..7] panel preloaded by the previous iteration) and
 *           store it back, while preloading the next panel u[8..15]
 *           into A03/A47;
 *   half 2: accumulate w[4..7] * c{03,47}_{0..3} into A03/A47 and
 *           store u[8..15], while preloading the panel after that
 *           (u[16..23]) into u03/u47.
 * prefetcht0 hints pull the upcoming u panel and w group toward L1
 * ahead of their use.
 *
 * NOTE(review): i, rhs, u, w and the ymm-register unions (u03, u47,
 * A03, A47, a03, a47, w_tmp, c03_0..c03_3, c47_0..c47_3) are declared
 * outside this chunk.  The per-iteration pointer advances for u and w
 * and the loop's closing brace fall in source lines not visible here
 * (original lines 71-77) -- presumably u += 16, w += 8; TODO confirm
 * against the full file.
 */
for ( i = 0; i < rhs; i ++ ) {
    /* Preload the next 8-double panel of u; the current panel sits in
       u03/u47 from the previous iteration (or the loop preamble). */
    A03.v = _mm256_load_pd( u + 8 );
    A47.v = _mm256_load_pd( u + 12 );
    /* Prefetch the panel after next and the next 4-weight group. */
    __asm__ volatile( "prefetcht0 0(%0) \n\t" : : "r"( u + 16 ) );
    __asm__ volatile( "prefetcht0 0(%0) \n\t" : : "r"( w + 4 ) );

    /* u[0..7] += w[0] * c_0 : broadcast one weight across a ymm,
       multiply the rows-0..3 and rows-4..7 halves separately. */
    w_tmp.v = _mm256_broadcast_sd( (double*)w );
    a03.v = _mm256_mul_pd( w_tmp.v, c03_0.v );
    a47.v = _mm256_mul_pd( w_tmp.v, c47_0.v );
    u03.v = _mm256_add_pd( u03.v, a03.v );
    u47.v = _mm256_add_pd( u47.v, a47.v );

    /* u[0..7] += w[1] * c_1 */
    w_tmp.v = _mm256_broadcast_sd( (double*)( w + 1 ) );
    a03.v = _mm256_mul_pd( w_tmp.v, c03_1.v );
    a47.v = _mm256_mul_pd( w_tmp.v, c47_1.v );
    u03.v = _mm256_add_pd( u03.v, a03.v );
    u47.v = _mm256_add_pd( u47.v, a47.v );

    /* u[0..7] += w[2] * c_2 */
    w_tmp.v = _mm256_broadcast_sd( (double*)( w + 2 ) );
    a03.v = _mm256_mul_pd( w_tmp.v, c03_2.v );
    a47.v = _mm256_mul_pd( w_tmp.v, c47_2.v );
    u03.v = _mm256_add_pd( u03.v, a03.v );
    u47.v = _mm256_add_pd( u47.v, a47.v );

    /* u[0..7] += w[3] * c_3 */
    w_tmp.v = _mm256_broadcast_sd( (double*)( w + 3 ) );
    a03.v = _mm256_mul_pd( w_tmp.v, c03_3.v );
    a47.v = _mm256_mul_pd( w_tmp.v, c47_3.v );
    u03.v = _mm256_add_pd( u03.v, a03.v );
    u47.v = _mm256_add_pd( u47.v, a47.v );

    /* First panel done: write u[0..7] back, then preload the panel
       two ahead into u03/u47 for the next iteration's half 1. */
    _mm256_store_pd( u , u03.v );
    _mm256_store_pd( u + 4 , u47.v );
    u03.v = _mm256_load_pd( u + 16 );
    u47.v = _mm256_load_pd( u + 20 );
    __asm__ volatile( "prefetcht0 0(%0) \n\t" : : "r"( u + 24 ) );
    __asm__ volatile( "prefetcht0 0(%0) \n\t" : : "r"( w + 8 ) );

    /* Second unrolled half: u[8..15] += w[4..7] * c_{0..3},
       accumulated in A03/A47. */
    w_tmp.v = _mm256_broadcast_sd( (double*)( w + 4 ) );
    a03.v = _mm256_mul_pd( w_tmp.v, c03_0.v );
    a47.v = _mm256_mul_pd( w_tmp.v, c47_0.v );
    A03.v = _mm256_add_pd( A03.v, a03.v );
    A47.v = _mm256_add_pd( A47.v, a47.v );

    w_tmp.v = _mm256_broadcast_sd( (double*)( w + 5 ) );
    a03.v = _mm256_mul_pd( w_tmp.v, c03_1.v );
    a47.v = _mm256_mul_pd( w_tmp.v, c47_1.v );
    A03.v = _mm256_add_pd( A03.v, a03.v );
    A47.v = _mm256_add_pd( A47.v, a47.v );

    w_tmp.v = _mm256_broadcast_sd( (double*)( w + 6 ) );
    a03.v = _mm256_mul_pd( w_tmp.v, c03_2.v );
    a47.v = _mm256_mul_pd( w_tmp.v, c47_2.v );
    A03.v = _mm256_add_pd( A03.v, a03.v );
    A47.v = _mm256_add_pd( A47.v, a47.v );

    w_tmp.v = _mm256_broadcast_sd( (double*)( w + 7 ) );
    a03.v = _mm256_mul_pd( w_tmp.v, c03_3.v );
    a47.v = _mm256_mul_pd( w_tmp.v, c47_3.v );
    A03.v = _mm256_add_pd( A03.v, a03.v );
    A47.v = _mm256_add_pd( A47.v, a47.v );

    /* Second panel done: write u[8..15] back. */
    _mm256_store_pd( u + 8, A03.v );
    _mm256_store_pd( u + 12, A47.v );

    /* NOTE(review): original source lines 71-77 (loop close and,
       presumably, the u/w pointer advances) are not part of this
       chunk. */

/* Epilogue: one final rank-4 update of the last preloaded panel
   (u03/u47).  The c03_*/c47_* vectors are scaled in place here
   instead of via a03/a47 -- safe only if they are not reused after
   this point; their remaining lifetime ends outside this chunk
   (TODO confirm). */
w_tmp.v = _mm256_broadcast_sd( (double*)w );
c03_0.v = _mm256_mul_pd( w_tmp.v, c03_0.v );
c47_0.v = _mm256_mul_pd( w_tmp.v, c47_0.v );
u03.v = _mm256_add_pd( u03.v, c03_0.v );
u47.v = _mm256_add_pd( u47.v, c47_0.v );

w_tmp.v = _mm256_broadcast_sd( (double*)( w + 1 ) );
c03_1.v = _mm256_mul_pd( w_tmp.v, c03_1.v );
c47_1.v = _mm256_mul_pd( w_tmp.v, c47_1.v );
u03.v = _mm256_add_pd( u03.v, c03_1.v );
u47.v = _mm256_add_pd( u47.v, c47_1.v );

w_tmp.v = _mm256_broadcast_sd( (double*)( w + 2 ) );
c03_2.v = _mm256_mul_pd( w_tmp.v, c03_2.v );
c47_2.v = _mm256_mul_pd( w_tmp.v, c47_2.v );
u03.v = _mm256_add_pd( u03.v, c03_2.v );
u47.v = _mm256_add_pd( u47.v, c47_2.v );

w_tmp.v = _mm256_broadcast_sd( (double*)( w + 3 ) );
c03_3.v = _mm256_mul_pd( w_tmp.v, c03_3.v );
c47_3.v = _mm256_mul_pd( w_tmp.v, c47_3.v );
u03.v = _mm256_add_pd( u03.v, c03_3.v );
u47.v = _mm256_add_pd( u47.v, c47_3.v );

/* Write the final panel back. */
_mm256_store_pd( u , u03.v );
_mm256_store_pd( u + 4 , u47.v );