@@ -27,147 +27,294 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "common.h"

#if !defined(DOUBLE)
- #define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n)
- #define FLOAT_V_T vfloat32m4_t
- #define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4)
- #define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4)
- #define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4)
- #define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4)
- #define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4)
- #define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4)
+ #define VSETVL(n) RISCV_RVV(vsetvl_e32m2)(n)
+ #define FLOAT_V_T vfloat32m2_t
+ #define VLEV_FLOAT RISCV_RVV(vle32_v_f32m2)
+ #define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m2)
+ #define VSEV_FLOAT RISCV_RVV(vse32_v_f32m2)
+ #define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m2)
+ #define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m2)
+ #define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m2)
+ #define VFMUL_VF_FLOAT RISCV_RVV(vfmul_vf_f32m2)
+ #define VSEV_FLOAT RISCV_RVV(vse32_v_f32m2)
#else
- #define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n)
- #define FLOAT_V_T vfloat64m4_t
- #define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4)
- #define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4)
- #define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4)
- #define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4)
- #define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4)
- #define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4)
+ #define VSETVL(n) RISCV_RVV(vsetvl_e64m2)(n)
+ #define FLOAT_V_T vfloat64m2_t
+ #define VLEV_FLOAT RISCV_RVV(vle64_v_f64m2)
+ #define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m2)
+ #define VSEV_FLOAT RISCV_RVV(vse64_v_f64m2)
+ #define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m2)
+ #define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m2)
+ #define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m2)
+ #define VFMUL_VF_FLOAT RISCV_RVV(vfmul_vf_f64m2)
+ #define VSEV_FLOAT RISCV_RVV(vse64_v_f64m2)
#endif

int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
-     BLASLONG i = 0, j = 0, k = 0;
+     BLASLONG i = 0, j = 0, k = 0;
    BLASLONG ix = 0, iy = 0;
    FLOAT *a_ptr = a;
-     FLOAT temp_r = 0.0, temp_i = 0.0;
-     FLOAT_V_T va0, va1, vy0, vy1;
+     FLOAT temp_r = 0.0, temp_i = 0.0, temp_r1, temp_i1, temp_r2, temp_i2, temp_r3, temp_i3, temp_rr[4], temp_ii[4];
+     FLOAT_V_T va0, va1, vy0, vy1, vy0_new, vy1_new, va2, va3, va4, va5, va6, va7, temp_iv, temp_rv, x_v0, x_v1, temp_v1, temp_v2, temp_v3, temp_v4;
    unsigned int gvl = 0;
    BLASLONG stride_a = sizeof(FLOAT) * 2;
    BLASLONG stride_y = inc_y * sizeof(FLOAT) * 2;
    gvl = VSETVL(m);
    BLASLONG inc_yv = inc_y * gvl * 2;
    BLASLONG inc_x2 = inc_x * 2;
    BLASLONG lda2 = lda * 2;
-     for (k = 0, j = 0; k < m / gvl; k++){
+     vy0_new = VLSEV_FLOAT(&y[iy], stride_y, gvl);
+     vy1_new = VLSEV_FLOAT(&y[iy + 1], stride_y, gvl);
+     for (k = 0, j = 0; k < m / gvl; k++)
+     {
        a_ptr = a;
        ix = 0;
-         vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl);
-         vy1 = VLSEV_FLOAT(&y[iy + 1], stride_y, gvl);
-         for (i = 0; i < n; i++){
+         vy0 = vy0_new;
+         vy1 = vy1_new;
+
+         if (k < m / gvl - 1)
+         {
+             vy0_new = VLSEV_FLOAT(&y[iy + inc_yv], stride_y, gvl);
+             vy1_new = VLSEV_FLOAT(&y[iy + inc_yv + 1], stride_y, gvl);
+         }
+         for (i = 0; i < n % 4; i++)
+         {
#if !defined(XCONJ)
-             temp_r = alpha_r * x[ix] - alpha_i * x[ix + 1];
-             temp_i = alpha_r * x[ix + 1] + alpha_i * x[ix];
+             temp_r = alpha_r * x[ix] - alpha_i * x[ix + 1];
+             temp_i = alpha_r * x[ix + 1] + alpha_i * x[ix];
#else
-             temp_r = alpha_r * x[ix] + alpha_i * x[ix + 1];
-             temp_i = alpha_r * x[ix + 1] - alpha_i * x[ix];
+             temp_r = alpha_r * x[ix] + alpha_i * x[ix + 1];
+             temp_i = alpha_r * x[ix + 1] - alpha_i * x[ix];
#endif

            va0 = VLSEV_FLOAT(&a_ptr[j], stride_a, gvl);
-             va1 = VLSEV_FLOAT(&a_ptr[j + 1], stride_a, gvl);
+             va1 = VLSEV_FLOAT(&a_ptr[j + 1], stride_a, gvl);
#if !defined(CONJ)
#if !defined(XCONJ)
-             vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl);
-             vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, gvl);
-             vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, gvl);
-             vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, gvl);
+             vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl);
+             vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, gvl);
+             vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, gvl);
+             vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, gvl);
#else

-             vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl);
-             vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, gvl);
-             vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, gvl);
-             vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, gvl);
+             vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl);
+             vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, gvl);
+             vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, gvl);
+             vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, gvl);
#endif

#else

#if !defined(XCONJ)
-             vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl);
-             vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, gvl);
-             vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, gvl);
-             vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, gvl);
+             vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl);
+             vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, gvl);
+             vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, gvl);
+             vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, gvl);
#else
-             vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl);
-             vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, gvl);
-             vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, gvl);
-             vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, gvl);
+             vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl);
+             vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, gvl);
+             vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, gvl);
+             vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, gvl);
#endif

#endif
            a_ptr += lda2;
            ix += inc_x2;
        }
+
+         for (; i < n; i += 4)
+         {
+ #if !defined(XCONJ)
+
+             x_v0 = VLSEV_FLOAT(&x[ix], inc_x2 * sizeof(FLOAT), 4);
+             x_v1 = VLSEV_FLOAT(&x[ix + 1], inc_x2 * sizeof(FLOAT), 4);
+             temp_rv = VFMUL_VF_FLOAT(x_v0, alpha_r, 4);
+             temp_iv = VFMUL_VF_FLOAT(x_v0, alpha_i, 4);
+             temp_rv = VFNMSACVF_FLOAT(temp_rv, alpha_i, x_v1, 4);
+             temp_iv = VFMACCVF_FLOAT(temp_iv, alpha_r, x_v1, 4);
+             VSEV_FLOAT(&temp_rr[0], temp_rv, 4);
+             VSEV_FLOAT(&temp_ii[0], temp_iv, 4);
+
+ #else
+             x_v0 = VLSEV_FLOAT(&x[ix], inc_x2 * sizeof(FLOAT), 4);
+             x_v1 = VLSEV_FLOAT(&x[ix + 1], inc_x2 * sizeof(FLOAT), 4);
+             temp_rv = VFMUL_VF_FLOAT(x_v0, alpha_r, 4);
+             temp_iv = VFMUL_VF_FLOAT(x_v0, alpha_i, 4);
+             temp_rv = VFMACCVF_FLOAT(temp_rv, alpha_i, x_v1, 4);
+             temp_iv = VFNMSACVF_FLOAT(temp_iv, alpha_r, x_v1, 4);
+             VSEV_FLOAT(&temp_rr[0], temp_rv, 4);
+             VSEV_FLOAT(&temp_ii[0], temp_iv, 4);
+
+ #endif
+
+             va0 = VLSEV_FLOAT(&a_ptr[j], stride_a, gvl);
+             va1 = VLSEV_FLOAT(&a_ptr[j + 1], stride_a, gvl);
+             va2 = VLSEV_FLOAT(&a_ptr[j + lda2], stride_a, gvl);
+             va3 = VLSEV_FLOAT(&a_ptr[j + lda2 + 1], stride_a, gvl);
+             va4 = VLSEV_FLOAT(&a_ptr[j + lda2 * 2], stride_a, gvl);
+             va5 = VLSEV_FLOAT(&a_ptr[j + lda2 * 2 + 1], stride_a, gvl);
+             va6 = VLSEV_FLOAT(&a_ptr[j + lda2 * 3], stride_a, gvl);
+             va7 = VLSEV_FLOAT(&a_ptr[j + lda2 * 3 + 1], stride_a, gvl);
+
+ #if !defined(CONJ)
+ #if !defined(XCONJ)
+             vy0 = VFMACCVF_FLOAT(vy0, temp_rr[0], va0, gvl);
+             vy0 = VFNMSACVF_FLOAT(vy0, temp_ii[0], va1, gvl);
+             vy1 = VFMACCVF_FLOAT(vy1, temp_rr[0], va1, gvl);
+             vy1 = VFMACCVF_FLOAT(vy1, temp_ii[0], va0, gvl);
+
+             vy0 = VFMACCVF_FLOAT(vy0, temp_rr[1], va2, gvl);
+             vy0 = VFNMSACVF_FLOAT(vy0, temp_ii[1], va3, gvl);
+             vy1 = VFMACCVF_FLOAT(vy1, temp_rr[1], va3, gvl);
+             vy1 = VFMACCVF_FLOAT(vy1, temp_ii[1], va2, gvl);
+
+             vy0 = VFMACCVF_FLOAT(vy0, temp_rr[2], va4, gvl);
+             vy0 = VFNMSACVF_FLOAT(vy0, temp_ii[2], va5, gvl);
+             vy1 = VFMACCVF_FLOAT(vy1, temp_rr[2], va5, gvl);
+             vy1 = VFMACCVF_FLOAT(vy1, temp_ii[2], va4, gvl);
+
+             vy0 = VFMACCVF_FLOAT(vy0, temp_rr[3], va6, gvl);
+             vy0 = VFNMSACVF_FLOAT(vy0, temp_ii[3], va7, gvl);
+             vy1 = VFMACCVF_FLOAT(vy1, temp_rr[3], va7, gvl);
+             vy1 = VFMACCVF_FLOAT(vy1, temp_ii[3], va6, gvl);
+
+ #else
+
+             vy0 = VFMACCVF_FLOAT(vy0, temp_rr[0], va0, gvl);
+             vy0 = VFMACCVF_FLOAT(vy0, temp_ii[0], va1, gvl);
+             vy1 = VFMACCVF_FLOAT(vy1, temp_rr[0], va1, gvl);
+             vy1 = VFNMSACVF_FLOAT(vy1, temp_ii[0], va0, gvl);
+
+             vy0 = VFMACCVF_FLOAT(vy0, temp_rr[1], va2, gvl);
+             vy0 = VFMACCVF_FLOAT(vy0, temp_ii[1], va3, gvl);
+             vy1 = VFMACCVF_FLOAT(vy1, temp_rr[1], va3, gvl);
+             vy1 = VFNMSACVF_FLOAT(vy1, temp_ii[1], va2, gvl);
+
+             vy0 = VFMACCVF_FLOAT(vy0, temp_rr[2], va4, gvl);
+             vy0 = VFMACCVF_FLOAT(vy0, temp_ii[2], va5, gvl);
+             vy1 = VFMACCVF_FLOAT(vy1, temp_rr[2], va5, gvl);
+             vy1 = VFNMSACVF_FLOAT(vy1, temp_ii[2], va4, gvl);
+
+             vy0 = VFMACCVF_FLOAT(vy0, temp_rr[3], va6, gvl);
+             vy0 = VFMACCVF_FLOAT(vy0, temp_ii[3], va7, gvl);
+             vy1 = VFMACCVF_FLOAT(vy1, temp_rr[3], va7, gvl);
+             vy1 = VFNMSACVF_FLOAT(vy1, temp_ii[3], va6, gvl);
+
+ #endif
+
+ #else
+
+ #if !defined(XCONJ)
+             vy0 = VFMACCVF_FLOAT(vy0, temp_rr[0], va0, gvl);
+             vy0 = VFMACCVF_FLOAT(vy0, temp_ii[0], va1, gvl);
+             vy1 = VFNMSACVF_FLOAT(vy1, temp_rr[0], va1, gvl);
+             vy1 = VFMACCVF_FLOAT(vy1, temp_ii[0], va0, gvl);
+
+             vy0 = VFMACCVF_FLOAT(vy0, temp_rr[1], va2, gvl);
+             vy0 = VFMACCVF_FLOAT(vy0, temp_ii[1], va3, gvl);
+             vy1 = VFNMSACVF_FLOAT(vy1, temp_rr[1], va3, gvl);
+             vy1 = VFMACCVF_FLOAT(vy1, temp_ii[1], va2, gvl);
+
+             vy0 = VFMACCVF_FLOAT(vy0, temp_rr[2], va4, gvl);
+             vy0 = VFMACCVF_FLOAT(vy0, temp_ii[2], va5, gvl);
+             vy1 = VFNMSACVF_FLOAT(vy1, temp_rr[2], va5, gvl);
+             vy1 = VFMACCVF_FLOAT(vy1, temp_ii[2], va4, gvl);
+
+             vy0 = VFMACCVF_FLOAT(vy0, temp_rr[3], va6, gvl);
+             vy0 = VFMACCVF_FLOAT(vy0, temp_ii[3], va7, gvl);
+             vy1 = VFNMSACVF_FLOAT(vy1, temp_rr[3], va7, gvl);
+             vy1 = VFMACCVF_FLOAT(vy1, temp_ii[3], va6, gvl);
+
+ #else
+             vy0 = VFMACCVF_FLOAT(vy0, temp_rr[0], va0, gvl);
+             vy0 = VFNMSACVF_FLOAT(vy0, temp_ii[0], va1, gvl);
+             vy1 = VFNMSACVF_FLOAT(vy1, temp_rr[0], va1, gvl);
+             vy1 = VFNMSACVF_FLOAT(vy1, temp_ii[0], va0, gvl);
+
+             vy0 = VFMACCVF_FLOAT(vy0, temp_rr[1], va2, gvl);
+             vy0 = VFNMSACVF_FLOAT(vy0, temp_ii[1], va3, gvl);
+             vy1 = VFNMSACVF_FLOAT(vy1, temp_rr[1], va3, gvl);
+             vy1 = VFNMSACVF_FLOAT(vy1, temp_ii[1], va2, gvl);
+
+             vy0 = VFMACCVF_FLOAT(vy0, temp_rr[2], va4, gvl);
+             vy0 = VFNMSACVF_FLOAT(vy0, temp_ii[2], va5, gvl);
+             vy1 = VFNMSACVF_FLOAT(vy1, temp_rr[2], va5, gvl);
+             vy1 = VFNMSACVF_FLOAT(vy1, temp_ii[2], va4, gvl);
+
+             vy0 = VFMACCVF_FLOAT(vy0, temp_rr[3], va6, gvl);
+             vy0 = VFNMSACVF_FLOAT(vy0, temp_ii[3], va7, gvl);
+             vy1 = VFNMSACVF_FLOAT(vy1, temp_rr[3], va7, gvl);
+             vy1 = VFNMSACVF_FLOAT(vy1, temp_ii[3], va6, gvl);
+
+ #endif
+
+ #endif
+             a_ptr += lda2 * 4;
+             ix += inc_x2 * 4;
+         }
+
        VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl);
-         VSSEV_FLOAT(&y[iy + 1], stride_y, vy1, gvl);
+         VSSEV_FLOAT(&y[iy + 1], stride_y, vy1, gvl);
        j += gvl * 2;
        iy += inc_yv;
    }
-     //tail
-     if (j / 2 < m){
-         gvl = VSETVL(m - j / 2);
+     // tail
+     if (j / 2 < m)
+     {
+         gvl = VSETVL(m - j / 2);
        a_ptr = a;
        ix = 0;
        vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl);
-         vy1 = VLSEV_FLOAT(&y[iy + 1], stride_y, gvl);
-         for (i = 0; i < n; i++){
+         vy1 = VLSEV_FLOAT(&y[iy + 1], stride_y, gvl);
+         for (i = 0; i < n; i++)
+         {
#if !defined(XCONJ)
-             temp_r = alpha_r * x[ix] - alpha_i * x[ix + 1];
-             temp_i = alpha_r * x[ix + 1] + alpha_i * x[ix];
+             temp_r = alpha_r * x[ix] - alpha_i * x[ix + 1];
+             temp_i = alpha_r * x[ix + 1] + alpha_i * x[ix];
#else
-             temp_r = alpha_r * x[ix] + alpha_i * x[ix + 1];
-             temp_i = alpha_r * x[ix + 1] - alpha_i * x[ix];
+             temp_r = alpha_r * x[ix] + alpha_i * x[ix + 1];
+             temp_i = alpha_r * x[ix + 1] - alpha_i * x[ix];
#endif

            va0 = VLSEV_FLOAT(&a_ptr[j], stride_a, gvl);
-             va1 = VLSEV_FLOAT(&a_ptr[j + 1], stride_a, gvl);
+             va1 = VLSEV_FLOAT(&a_ptr[j + 1], stride_a, gvl);
#if !defined(CONJ)

#if !defined(XCONJ)
-             vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl);
-             vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, gvl);
-             vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, gvl);
-             vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, gvl);
+             vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl);
+             vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, gvl);
+             vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, gvl);
+             vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, gvl);
#else

-             vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl);
-             vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, gvl);
-             vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, gvl);
-             vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, gvl);
+             vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl);
+             vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, gvl);
+             vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, gvl);
+             vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, gvl);
#endif

#else

#if !defined(XCONJ)
-             vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl);
-             vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, gvl);
-             vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, gvl);
-             vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, gvl);
+             vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl);
+             vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, gvl);
+             vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, gvl);
+             vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, gvl);
#else
-             vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl);
-             vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, gvl);
-             vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, gvl);
-             vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, gvl);
+             vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl);
+             vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, gvl);
+             vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, gvl);
+             vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, gvl);
#endif

#endif
            a_ptr += lda2;
            ix += inc_x2;
        }
        VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl);
-         VSSEV_FLOAT(&y[iy + 1], stride_y, vy1, gvl);
+         VSSEV_FLOAT(&y[iy + 1], stride_y, vy1, gvl);
    }
-     return(0);
+     return(0);
}

-
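Note (not part of the patch): the unrolled block added above first forms temp = alpha * x[j] for four columns at once with VFMUL_VF_FLOAT / VFMACCVF_FLOAT / VFNMSACVF_FLOAT and spills the four real and imaginary scalars to temp_rr[] / temp_ii[], then accumulates y += temp * A(:, j) for those four columns over each vector slice of gvl complex elements. A minimal scalar sketch of one column step in the default build (neither CONJ nor XCONJ defined), written with plain doubles and hypothetical names for illustration only:

/* Hypothetical scalar reference for one column step of the kernel above
 * (default case: CONJ and XCONJ not defined). Illustration only. */
static void zgemv_n_column_step(long rows,
                                double alpha_r, double alpha_i,
                                const double *xj,  /* one complex element of x (re, im) */
                                const double *col, /* column of A, interleaved re/im */
                                double *y)         /* y, interleaved re/im, unit stride */
{
    /* temp = alpha * x[j]: what the VFMUL_VF/VFMACCVF/VFNMSACVF sequence
     * stores into temp_rr[] / temp_ii[] for four columns at a time */
    double temp_r = alpha_r * xj[0] - alpha_i * xj[1];
    double temp_i = alpha_r * xj[1] + alpha_i * xj[0];

    /* y += temp * A(:, j): what each group of four VFMACCVF/VFNMSACVF calls
     * does for one column across a vector slice */
    for (long k = 0; k < rows; k++) {
        y[2 * k]     += temp_r * col[2 * k]     - temp_i * col[2 * k + 1];
        y[2 * k + 1] += temp_r * col[2 * k + 1] + temp_i * col[2 * k];
    }
}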