@@ -27,45 +27,57 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
27
28
28
#include "common.h"

/*
 * Precision-dependent RVV intrinsic aliases for the complex GEMV kernel.
 * LMUL=2 is used (register group of 2) so that the unrolled inner loop
 * below has enough architectural registers for 8 column vectors plus the
 * two accumulators without spilling.
 *
 * Fix vs. the extracted state: VSEV_FLOAT was #define'd twice in each
 * branch (a redundant benign redefinition); the duplicate is removed.
 */
#if !defined(DOUBLE)
/* Single precision (e32, m2). */
#define VSETVL(n)        RISCV_RVV(vsetvl_e32m2)(n)
#define FLOAT_V_T        vfloat32m2_t
#define VLEV_FLOAT       RISCV_RVV(vle32_v_f32m2)   /* unit-stride load        */
#define VLSEV_FLOAT      RISCV_RVV(vlse32_v_f32m2)  /* strided load            */
#define VSEV_FLOAT       RISCV_RVV(vse32_v_f32m2)   /* unit-stride store       */
#define VSSEV_FLOAT      RISCV_RVV(vsse32_v_f32m2)  /* strided store           */
#define VFMACCVF_FLOAT   RISCV_RVV(vfmacc_vf_f32m2) /* v += scalar * v         */
#define VFNMSACVF_FLOAT  RISCV_RVV(vfnmsac_vf_f32m2)/* v -= scalar * v         */
#define VFMUL_VF_FLOAT   RISCV_RVV(vfmul_vf_f32m2)  /* v  = v * scalar         */
#else
/* Double precision (e64, m2). */
#define VSETVL(n)        RISCV_RVV(vsetvl_e64m2)(n)
#define FLOAT_V_T        vfloat64m2_t
#define VLEV_FLOAT       RISCV_RVV(vle64_v_f64m2)   /* unit-stride load        */
#define VLSEV_FLOAT      RISCV_RVV(vlse64_v_f64m2)  /* strided load            */
#define VSEV_FLOAT       RISCV_RVV(vse64_v_f64m2)   /* unit-stride store       */
#define VSSEV_FLOAT      RISCV_RVV(vsse64_v_f64m2)  /* strided store           */
#define VFMACCVF_FLOAT   RISCV_RVV(vfmacc_vf_f64m2) /* v += scalar * v         */
#define VFNMSACVF_FLOAT  RISCV_RVV(vfnmsac_vf_f64m2)/* v -= scalar * v         */
#define VFMUL_VF_FLOAT   RISCV_RVV(vfmul_vf_f64m2)  /* v  = v * scalar         */
#endif
49
53
int CNAME (BLASLONG m , BLASLONG n , BLASLONG dummy1 , FLOAT alpha_r , FLOAT alpha_i , FLOAT * a , BLASLONG lda , FLOAT * x , BLASLONG inc_x , FLOAT * y , BLASLONG inc_y , FLOAT * buffer )
50
54
{
51
55
BLASLONG i = 0 , j = 0 , k = 0 ;
52
56
BLASLONG ix = 0 , iy = 0 ;
53
57
FLOAT * a_ptr = a ;
54
- FLOAT temp_r = 0.0 , temp_i = 0.0 ;
55
- FLOAT_V_T va0 , va1 , vy0 , vy1 ;
58
+ FLOAT temp_r = 0.0 , temp_i = 0.0 , temp_r1 , temp_i1 , temp_r2 , temp_i2 , temp_r3 , temp_i3 , temp_rr [ 4 ] , temp_ii [ 4 ] ;
59
+ FLOAT_V_T va0 , va1 , vy0 , vy1 , vy0_new , vy1_new , va2 , va3 , va4 , va5 , va6 , va7 , temp_iv , temp_rv , x_v0 , x_v1 , temp_v1 , temp_v2 , temp_v3 , temp_v4 ;
56
60
unsigned int gvl = 0 ;
57
61
BLASLONG stride_a = sizeof (FLOAT ) * 2 ;
58
62
BLASLONG stride_y = inc_y * sizeof (FLOAT ) * 2 ;
59
63
gvl = VSETVL (m );
60
64
BLASLONG inc_yv = inc_y * gvl * 2 ;
61
65
BLASLONG inc_x2 = inc_x * 2 ;
62
66
BLASLONG lda2 = lda * 2 ;
67
+ vy0_new = VLSEV_FLOAT (& y [iy ], stride_y , gvl );
68
+ vy1_new = VLSEV_FLOAT (& y [iy + 1 ], stride_y , gvl );
63
69
for (k = 0 ,j = 0 ; k < m /gvl ; k ++ ){
64
70
a_ptr = a ;
65
71
ix = 0 ;
66
- vy0 = VLSEV_FLOAT (& y [iy ], stride_y , gvl );
67
- vy1 = VLSEV_FLOAT (& y [iy + 1 ], stride_y , gvl );
68
- for (i = 0 ; i < n ; i ++ ){
72
+ vy0 = vy0_new ;
73
+ vy1 = vy1_new ;
74
+ // vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl);
75
+ // vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl);
76
+ if (k < m /gvl - 1 ){
77
+ vy0_new = VLSEV_FLOAT (& y [iy + inc_yv ], stride_y , gvl );
78
+ vy1_new = VLSEV_FLOAT (& y [iy + inc_yv + 1 ], stride_y , gvl );
79
+ }
80
+ for (i = 0 ; i < n %4 ; i ++ ){
69
81
#if !defined(XCONJ )
70
82
temp_r = alpha_r * x [ix ] - alpha_i * x [ix + 1 ];
71
83
temp_i = alpha_r * x [ix + 1 ] + alpha_i * x [ix ];
@@ -74,8 +86,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
74
86
temp_i = alpha_r * x [ix + 1 ] - alpha_i * x [ix ];
75
87
#endif
76
88
77
- va0 = VLSEV_FLOAT (& a_ptr [j ], stride_a , gvl );
78
- va1 = VLSEV_FLOAT (& a_ptr [j + 1 ], stride_a , gvl );
89
+ va0 = VLSEV_FLOAT (& a_ptr [j ], stride_a , gvl );
90
+ va1 = VLSEV_FLOAT (& a_ptr [j + 1 ], stride_a , gvl );
79
91
#if !defined(CONJ )
80
92
#if !defined(XCONJ )
81
93
vy0 = VFMACCVF_FLOAT (vy0 , temp_r , va0 , gvl );
@@ -108,6 +120,144 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
108
120
a_ptr += lda2 ;
109
121
ix += inc_x2 ;
110
122
}
123
+
124
+ for (; i < n ; i += 4 ){
125
+ #if !defined(XCONJ )
126
+
127
+
128
+ x_v0 = VLSEV_FLOAT (& x [ix ], inc_x2 * sizeof (FLOAT ), 4 );
129
+ x_v1 = VLSEV_FLOAT (& x [ix + 1 ], inc_x2 * sizeof (FLOAT ), 4 );
130
+ temp_rv = VFMUL_VF_FLOAT (x_v0 , alpha_r , 4 );
131
+ temp_iv = VFMUL_VF_FLOAT (x_v0 , alpha_i , 4 );
132
+ temp_rv = VFNMSACVF_FLOAT (temp_rv , alpha_i , x_v1 , 4 );
133
+ temp_iv = VFMACCVF_FLOAT (temp_iv , alpha_r , x_v1 , 4 );
134
+ VSEV_FLOAT (& temp_rr [0 ],temp_rv , 4 );
135
+ VSEV_FLOAT (& temp_ii [0 ],temp_iv , 4 );
136
+
137
+
138
+ #else
139
+ x_v0 = VLSEV_FLOAT (& x [ix ], inc_x2 * sizeof (FLOAT ), 4 );
140
+ x_v1 = VLSEV_FLOAT (& x [ix + 1 ], inc_x2 * sizeof (FLOAT ), 4 );
141
+ temp_rv = VFMUL_VF_FLOAT (x_v0 , alpha_r , 4 );
142
+ temp_iv = VFMUL_VF_FLOAT (x_v0 , alpha_i , 4 );
143
+ temp_rv = VFMACCVF_FLOAT (temp_rv , alpha_i , x_v1 , 4 );
144
+ temp_iv = VFNMSACVF_FLOAT (temp_iv , alpha_r , x_v1 , 4 );
145
+ VSEV_FLOAT (& temp_rr [0 ],temp_rv , 4 );
146
+ VSEV_FLOAT (& temp_ii [0 ],temp_iv , 4 );
147
+
148
+ #endif
149
+
150
+ va0 = VLSEV_FLOAT (& a_ptr [j ], stride_a , gvl );
151
+ va1 = VLSEV_FLOAT (& a_ptr [j + 1 ], stride_a , gvl );
152
+ va2 = VLSEV_FLOAT (& a_ptr [j + lda2 ], stride_a , gvl );
153
+ va3 = VLSEV_FLOAT (& a_ptr [j + lda2 + 1 ], stride_a , gvl );
154
+ va4 = VLSEV_FLOAT (& a_ptr [j + lda2 * 2 ], stride_a , gvl );
155
+ va5 = VLSEV_FLOAT (& a_ptr [j + lda2 * 2 + 1 ], stride_a , gvl );
156
+ va6 = VLSEV_FLOAT (& a_ptr [j + lda2 * 3 ], stride_a , gvl );
157
+ va7 = VLSEV_FLOAT (& a_ptr [j + lda2 * 3 + 1 ], stride_a , gvl );
158
+
159
+
160
+ #if !defined(CONJ )
161
+ #if !defined(XCONJ )
162
+ vy0 = VFMACCVF_FLOAT (vy0 , temp_rr [0 ], va0 , gvl );
163
+ vy0 = VFNMSACVF_FLOAT (vy0 , temp_ii [0 ], va1 , gvl );
164
+ vy1 = VFMACCVF_FLOAT (vy1 , temp_rr [0 ], va1 , gvl );
165
+ vy1 = VFMACCVF_FLOAT (vy1 , temp_ii [0 ], va0 , gvl );
166
+
167
+ vy0 = VFMACCVF_FLOAT (vy0 , temp_rr [1 ], va2 , gvl );
168
+ vy0 = VFNMSACVF_FLOAT (vy0 , temp_ii [1 ], va3 , gvl );
169
+ vy1 = VFMACCVF_FLOAT (vy1 , temp_rr [1 ], va3 , gvl );
170
+ vy1 = VFMACCVF_FLOAT (vy1 , temp_ii [1 ], va2 , gvl );
171
+
172
+ vy0 = VFMACCVF_FLOAT (vy0 , temp_rr [2 ], va4 , gvl );
173
+ vy0 = VFNMSACVF_FLOAT (vy0 , temp_ii [2 ], va5 , gvl );
174
+ vy1 = VFMACCVF_FLOAT (vy1 , temp_rr [2 ], va5 , gvl );
175
+ vy1 = VFMACCVF_FLOAT (vy1 , temp_ii [2 ], va4 , gvl );
176
+
177
+ vy0 = VFMACCVF_FLOAT (vy0 , temp_rr [3 ], va6 , gvl );
178
+ vy0 = VFNMSACVF_FLOAT (vy0 , temp_ii [3 ], va7 , gvl );
179
+ vy1 = VFMACCVF_FLOAT (vy1 , temp_rr [3 ], va7 , gvl );
180
+ vy1 = VFMACCVF_FLOAT (vy1 , temp_ii [3 ], va6 , gvl );
181
+
182
+
183
+ #else
184
+
185
+ vy0 = VFMACCVF_FLOAT (vy0 , temp_rr [0 ], va0 , gvl );
186
+ vy0 = VFMACCVF_FLOAT (vy0 , temp_ii [0 ], va1 , gvl );
187
+ vy1 = VFMACCVF_FLOAT (vy1 , temp_rr [0 ], va1 , gvl );
188
+ vy1 = VFNMSACVF_FLOAT (vy1 , temp_ii [0 ], va0 , gvl );
189
+
190
+ vy0 = VFMACCVF_FLOAT (vy0 , temp_rr [1 ], va2 , gvl );
191
+ vy0 = VFMACCVF_FLOAT (vy0 , temp_ii [1 ], va3 , gvl );
192
+ vy1 = VFMACCVF_FLOAT (vy1 , temp_rr [1 ], va3 , gvl );
193
+ vy1 = VFNMSACVF_FLOAT (vy1 , temp_ii [1 ], va2 , gvl );
194
+
195
+ vy0 = VFMACCVF_FLOAT (vy0 , temp_rr [2 ], va4 , gvl );
196
+ vy0 = VFMACCVF_FLOAT (vy0 , temp_ii [2 ], va5 , gvl );
197
+ vy1 = VFMACCVF_FLOAT (vy1 , temp_rr [2 ], va5 , gvl );
198
+ vy1 = VFNMSACVF_FLOAT (vy1 , temp_ii [2 ], va4 , gvl );
199
+
200
+ vy0 = VFMACCVF_FLOAT (vy0 , temp_rr [3 ], va6 , gvl );
201
+ vy0 = VFMACCVF_FLOAT (vy0 , temp_ii [3 ], va7 , gvl );
202
+ vy1 = VFMACCVF_FLOAT (vy1 , temp_rr [3 ], va7 , gvl );
203
+ vy1 = VFNMSACVF_FLOAT (vy1 , temp_ii [3 ], va6 , gvl );
204
+
205
+
206
+ #endif
207
+
208
+ #else
209
+
210
+ #if !defined(XCONJ )
211
+ vy0 = VFMACCVF_FLOAT (vy0 , temp_rr [0 ], va0 , gvl );
212
+ vy0 = VFMACCVF_FLOAT (vy0 , temp_ii [0 ], va1 , gvl );
213
+ vy1 = VFNMSACVF_FLOAT (vy1 , temp_rr [0 ], va1 , gvl );
214
+ vy1 = VFMACCVF_FLOAT (vy1 , temp_ii [0 ], va0 , gvl );
215
+
216
+ vy0 = VFMACCVF_FLOAT (vy0 , temp_rr [1 ], va2 , gvl );
217
+ vy0 = VFMACCVF_FLOAT (vy0 , temp_ii [1 ], va3 , gvl );
218
+ vy1 = VFNMSACVF_FLOAT (vy1 , temp_rr [1 ], va3 , gvl );
219
+ vy1 = VFMACCVF_FLOAT (vy1 , temp_ii [1 ], va2 , gvl );
220
+
221
+ vy0 = VFMACCVF_FLOAT (vy0 , temp_rr [2 ], va4 , gvl );
222
+ vy0 = VFMACCVF_FLOAT (vy0 , temp_ii [2 ], va5 , gvl );
223
+ vy1 = VFNMSACVF_FLOAT (vy1 , temp_rr [2 ], va5 , gvl );
224
+ vy1 = VFMACCVF_FLOAT (vy1 , temp_ii [2 ], va4 , gvl );
225
+
226
+ vy0 = VFMACCVF_FLOAT (vy0 , temp_rr [3 ], va6 , gvl );
227
+ vy0 = VFMACCVF_FLOAT (vy0 , temp_ii [3 ], va7 , gvl );
228
+ vy1 = VFNMSACVF_FLOAT (vy1 , temp_rr [3 ], va7 , gvl );
229
+ vy1 = VFMACCVF_FLOAT (vy1 , temp_ii [3 ], va6 , gvl );
230
+
231
+
232
+ #else
233
+ vy0 = VFMACCVF_FLOAT (vy0 , temp_rr [0 ], va0 , gvl );
234
+ vy0 = VFNMSACVF_FLOAT (vy0 , temp_ii [0 ], va1 , gvl );
235
+ vy1 = VFNMSACVF_FLOAT (vy1 , temp_rr [0 ], va1 , gvl );
236
+ vy1 = VFNMSACVF_FLOAT (vy1 , temp_ii [0 ], va0 , gvl );
237
+
238
+ vy0 = VFMACCVF_FLOAT (vy0 , temp_rr [1 ], va2 , gvl );
239
+ vy0 = VFNMSACVF_FLOAT (vy0 , temp_ii [1 ], va3 , gvl );
240
+ vy1 = VFNMSACVF_FLOAT (vy1 , temp_rr [1 ], va3 , gvl );
241
+ vy1 = VFNMSACVF_FLOAT (vy1 , temp_ii [1 ], va2 , gvl );
242
+
243
+ vy0 = VFMACCVF_FLOAT (vy0 , temp_rr [2 ], va4 , gvl );
244
+ vy0 = VFNMSACVF_FLOAT (vy0 , temp_ii [2 ], va5 , gvl );
245
+ vy1 = VFNMSACVF_FLOAT (vy1 , temp_rr [2 ], va5 , gvl );
246
+ vy1 = VFNMSACVF_FLOAT (vy1 , temp_ii [2 ], va4 , gvl );
247
+
248
+ vy0 = VFMACCVF_FLOAT (vy0 , temp_rr [3 ], va6 , gvl );
249
+ vy0 = VFNMSACVF_FLOAT (vy0 , temp_ii [3 ], va7 , gvl );
250
+ vy1 = VFNMSACVF_FLOAT (vy1 , temp_rr [3 ], va7 , gvl );
251
+ vy1 = VFNMSACVF_FLOAT (vy1 , temp_ii [3 ], va6 , gvl );
252
+
253
+ #endif
254
+
255
+ #endif
256
+ a_ptr += lda2 * 4 ;
257
+ ix += inc_x2 * 4 ;
258
+ }
259
+
260
+
111
261
VSSEV_FLOAT (& y [iy ], stride_y , vy0 , gvl );
112
262
VSSEV_FLOAT (& y [iy + 1 ], stride_y , vy1 , gvl );
113
263
j += gvl * 2 ;
@@ -171,3 +321,4 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
171
321
}
172
322
173
323
324
+