Commit 7616c42

Optimized RVV_ZVL256B Implementation of zgemv_n
The RVV_ZVL256B implementation of zgemv_n has been optimized; it achieves a 1.5x performance improvement over the previous implementation.
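The diff below combines three changes: the LMUL setting drops from m4 to m2 (leaving more vector registers for unrolling), the strided loads of the next gvl-wide block of y are issued one outer iteration early (the vy0_new/vy1_new double buffering), and the column loop is unrolled by four, with the alpha*x scalars for four columns produced by a short vector sequence of length 4. As a rough orientation only, here is a minimal scalar C model of the reorganized loop structure for the non-conjugated case; it is not the RVV kernel itself, and the name zgemv_n_model and its complex-double interface are illustrative assumptions.

/* Minimal scalar model (an assumption for exposition, not the RVV code):
 * remainder columns (n % 4) are handled first, then the main loop consumes
 * four columns per iteration, mirroring the new "for(; i < n; i += 4)" block. */
#include <complex.h>
#include <stddef.h>

static void zgemv_n_model(size_t m, size_t n, double complex alpha,
                          const double complex *a, size_t lda,   /* column-major */
                          const double complex *x, double complex *y)
{
    for (size_t row = 0; row < m; row++) {      /* one lane of a gvl-wide block */
        double complex acc = y[row];
        size_t j = 0;
        for (; j < n % 4; j++)                  /* remainder loop */
            acc += (alpha * x[j]) * a[row + j * lda];
        for (; j < n; j += 4) {                 /* unrolled by 4, like va0..va7 */
            acc += (alpha * x[j])     * a[row + (j    ) * lda];
            acc += (alpha * x[j + 1]) * a[row + (j + 1) * lda];
            acc += (alpha * x[j + 2]) * a[row + (j + 2) * lda];
            acc += (alpha * x[j + 3]) * a[row + (j + 3) * lda];
        }
        y[row] = acc;
    }
}

In the kernel, each pass of the outer k loop handles a whole gvl-wide slice of rows at once, and each complex product above corresponds to one VFMACCVF_FLOAT/VFNMSACVF_FLOAT pair on the real and imaginary halves; loading vy0_new/vy1_new one block ahead hides part of the latency of the strided y loads.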
1 parent dd38b4e commit 7616c42

File tree

1 file changed

kernel/riscv64/zgemv_n_vector.c: 174 additions, 23 deletions
@@ -27,45 +27,57 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 #if !defined(DOUBLE)
-#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n)
-#define FLOAT_V_T vfloat32m4_t
-#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4)
-#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4)
-#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4)
-#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4)
-#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4)
-#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4)
+#define VSETVL(n) RISCV_RVV(vsetvl_e32m2)(n)
+#define FLOAT_V_T vfloat32m2_t
+#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m2)
+#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m2)
+#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m2)
+#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m2)
+#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m2)
+#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m2)
+#define VFMUL_VF_FLOAT RISCV_RVV(vfmul_vf_f32m2)
+#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m2)
 #else
-#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n)
-#define FLOAT_V_T vfloat64m4_t
-#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4)
-#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4)
-#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4)
-#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4)
-#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4)
-#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4)
+#define VSETVL(n) RISCV_RVV(vsetvl_e64m2)(n)
+#define FLOAT_V_T vfloat64m2_t
+#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m2)
+#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m2)
+#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m2)
+#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m2)
+#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m2)
+#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m2)
+#define VFMUL_VF_FLOAT RISCV_RVV(vfmul_vf_f64m2)
+#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m2)
 #endif
 
 int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
 {
     BLASLONG i = 0, j = 0, k = 0;
     BLASLONG ix = 0, iy = 0;
     FLOAT *a_ptr = a;
-    FLOAT temp_r = 0.0, temp_i = 0.0;
-    FLOAT_V_T va0, va1, vy0, vy1;
+    FLOAT temp_r = 0.0, temp_i = 0.0, temp_r1, temp_i1, temp_r2, temp_i2, temp_r3, temp_i3, temp_rr[4], temp_ii[4];
+    FLOAT_V_T va0, va1, vy0, vy1, vy0_new, vy1_new, va2, va3, va4, va5, va6, va7, temp_iv, temp_rv, x_v0, x_v1, temp_v1, temp_v2, temp_v3, temp_v4;
     unsigned int gvl = 0;
     BLASLONG stride_a = sizeof(FLOAT) * 2;
     BLASLONG stride_y = inc_y * sizeof(FLOAT) * 2;
     gvl = VSETVL(m);
     BLASLONG inc_yv = inc_y * gvl * 2;
     BLASLONG inc_x2 = inc_x * 2;
     BLASLONG lda2 = lda * 2;
+    vy0_new = VLSEV_FLOAT(&y[iy], stride_y, gvl);
+    vy1_new = VLSEV_FLOAT(&y[iy+1], stride_y, gvl);
     for(k=0,j=0; k<m/gvl; k++){
         a_ptr = a;
         ix = 0;
-        vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl);
-        vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl);
-        for(i = 0; i < n; i++){
+        vy0 = vy0_new;
+        vy1 = vy1_new;
+        // vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl);
+        // vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl);
+        if(k < m/gvl - 1){
+            vy0_new = VLSEV_FLOAT(&y[iy + inc_yv], stride_y, gvl);
+            vy1_new = VLSEV_FLOAT(&y[iy + inc_yv + 1], stride_y, gvl);
+        }
+        for(i = 0; i < n % 4; i++){
 #if !defined(XCONJ)
             temp_r = alpha_r * x[ix] - alpha_i * x[ix+1];
             temp_i = alpha_r * x[ix+1] + alpha_i * x[ix];
@@ -74,8 +86,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
             temp_i = alpha_r * x[ix+1] - alpha_i * x[ix];
 #endif
 
-            va0 = VLSEV_FLOAT(&a_ptr[j], stride_a, gvl);
-            va1 = VLSEV_FLOAT(&a_ptr[j+1], stride_a, gvl);
+            va0 = VLSEV_FLOAT(&a_ptr[j], stride_a, gvl);
+            va1 = VLSEV_FLOAT(&a_ptr[j+1], stride_a, gvl);
 #if !defined(CONJ)
 #if !defined(XCONJ)
             vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl);
@@ -108,6 +120,144 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
             a_ptr += lda2;
             ix += inc_x2;
         }
+
+        for(; i < n; i += 4){
+#if !defined(XCONJ)
+            x_v0 = VLSEV_FLOAT(&x[ix], inc_x2 * sizeof(FLOAT), 4);
+            x_v1 = VLSEV_FLOAT(&x[ix+1], inc_x2 * sizeof(FLOAT), 4);
+            temp_rv = VFMUL_VF_FLOAT(x_v0, alpha_r, 4);
+            temp_iv = VFMUL_VF_FLOAT(x_v0, alpha_i, 4);
+            temp_rv = VFNMSACVF_FLOAT(temp_rv, alpha_i, x_v1, 4);
+            temp_iv = VFMACCVF_FLOAT(temp_iv, alpha_r, x_v1, 4);
+            VSEV_FLOAT(&temp_rr[0], temp_rv, 4);
+            VSEV_FLOAT(&temp_ii[0], temp_iv, 4);
+#else
+            x_v0 = VLSEV_FLOAT(&x[ix], inc_x2 * sizeof(FLOAT), 4);
+            x_v1 = VLSEV_FLOAT(&x[ix+1], inc_x2 * sizeof(FLOAT), 4);
+            temp_rv = VFMUL_VF_FLOAT(x_v0, alpha_r, 4);
+            temp_iv = VFMUL_VF_FLOAT(x_v0, alpha_i, 4);
+            temp_rv = VFMACCVF_FLOAT(temp_rv, alpha_i, x_v1, 4);
+            temp_iv = VFNMSACVF_FLOAT(temp_iv, alpha_r, x_v1, 4);
+            VSEV_FLOAT(&temp_rr[0], temp_rv, 4);
+            VSEV_FLOAT(&temp_ii[0], temp_iv, 4);
+#endif
+
+            va0 = VLSEV_FLOAT(&a_ptr[j], stride_a, gvl);
+            va1 = VLSEV_FLOAT(&a_ptr[j+1], stride_a, gvl);
+            va2 = VLSEV_FLOAT(&a_ptr[j+lda2], stride_a, gvl);
+            va3 = VLSEV_FLOAT(&a_ptr[j+lda2+1], stride_a, gvl);
+            va4 = VLSEV_FLOAT(&a_ptr[j+lda2*2], stride_a, gvl);
+            va5 = VLSEV_FLOAT(&a_ptr[j+lda2*2+1], stride_a, gvl);
+            va6 = VLSEV_FLOAT(&a_ptr[j+lda2*3], stride_a, gvl);
+            va7 = VLSEV_FLOAT(&a_ptr[j+lda2*3+1], stride_a, gvl);
+
+#if !defined(CONJ)
+#if !defined(XCONJ)
+            vy0 = VFMACCVF_FLOAT(vy0, temp_rr[0], va0, gvl);
+            vy0 = VFNMSACVF_FLOAT(vy0, temp_ii[0], va1, gvl);
+            vy1 = VFMACCVF_FLOAT(vy1, temp_rr[0], va1, gvl);
+            vy1 = VFMACCVF_FLOAT(vy1, temp_ii[0], va0, gvl);
+
+            vy0 = VFMACCVF_FLOAT(vy0, temp_rr[1], va2, gvl);
+            vy0 = VFNMSACVF_FLOAT(vy0, temp_ii[1], va3, gvl);
+            vy1 = VFMACCVF_FLOAT(vy1, temp_rr[1], va3, gvl);
+            vy1 = VFMACCVF_FLOAT(vy1, temp_ii[1], va2, gvl);
+
+            vy0 = VFMACCVF_FLOAT(vy0, temp_rr[2], va4, gvl);
+            vy0 = VFNMSACVF_FLOAT(vy0, temp_ii[2], va5, gvl);
+            vy1 = VFMACCVF_FLOAT(vy1, temp_rr[2], va5, gvl);
+            vy1 = VFMACCVF_FLOAT(vy1, temp_ii[2], va4, gvl);
+
+            vy0 = VFMACCVF_FLOAT(vy0, temp_rr[3], va6, gvl);
+            vy0 = VFNMSACVF_FLOAT(vy0, temp_ii[3], va7, gvl);
+            vy1 = VFMACCVF_FLOAT(vy1, temp_rr[3], va7, gvl);
+            vy1 = VFMACCVF_FLOAT(vy1, temp_ii[3], va6, gvl);
+#else
+            vy0 = VFMACCVF_FLOAT(vy0, temp_rr[0], va0, gvl);
+            vy0 = VFMACCVF_FLOAT(vy0, temp_ii[0], va1, gvl);
+            vy1 = VFMACCVF_FLOAT(vy1, temp_rr[0], va1, gvl);
+            vy1 = VFNMSACVF_FLOAT(vy1, temp_ii[0], va0, gvl);
+
+            vy0 = VFMACCVF_FLOAT(vy0, temp_rr[1], va2, gvl);
+            vy0 = VFMACCVF_FLOAT(vy0, temp_ii[1], va3, gvl);
+            vy1 = VFMACCVF_FLOAT(vy1, temp_rr[1], va3, gvl);
+            vy1 = VFNMSACVF_FLOAT(vy1, temp_ii[1], va2, gvl);
+
+            vy0 = VFMACCVF_FLOAT(vy0, temp_rr[2], va4, gvl);
+            vy0 = VFMACCVF_FLOAT(vy0, temp_ii[2], va5, gvl);
+            vy1 = VFMACCVF_FLOAT(vy1, temp_rr[2], va5, gvl);
+            vy1 = VFNMSACVF_FLOAT(vy1, temp_ii[2], va4, gvl);
+
+            vy0 = VFMACCVF_FLOAT(vy0, temp_rr[3], va6, gvl);
+            vy0 = VFMACCVF_FLOAT(vy0, temp_ii[3], va7, gvl);
+            vy1 = VFMACCVF_FLOAT(vy1, temp_rr[3], va7, gvl);
+            vy1 = VFNMSACVF_FLOAT(vy1, temp_ii[3], va6, gvl);
+#endif
+#else
+#if !defined(XCONJ)
+            vy0 = VFMACCVF_FLOAT(vy0, temp_rr[0], va0, gvl);
+            vy0 = VFMACCVF_FLOAT(vy0, temp_ii[0], va1, gvl);
+            vy1 = VFNMSACVF_FLOAT(vy1, temp_rr[0], va1, gvl);
+            vy1 = VFMACCVF_FLOAT(vy1, temp_ii[0], va0, gvl);
+
+            vy0 = VFMACCVF_FLOAT(vy0, temp_rr[1], va2, gvl);
+            vy0 = VFMACCVF_FLOAT(vy0, temp_ii[1], va3, gvl);
+            vy1 = VFNMSACVF_FLOAT(vy1, temp_rr[1], va3, gvl);
+            vy1 = VFMACCVF_FLOAT(vy1, temp_ii[1], va2, gvl);
+
+            vy0 = VFMACCVF_FLOAT(vy0, temp_rr[2], va4, gvl);
+            vy0 = VFMACCVF_FLOAT(vy0, temp_ii[2], va5, gvl);
+            vy1 = VFNMSACVF_FLOAT(vy1, temp_rr[2], va5, gvl);
+            vy1 = VFMACCVF_FLOAT(vy1, temp_ii[2], va4, gvl);
+
+            vy0 = VFMACCVF_FLOAT(vy0, temp_rr[3], va6, gvl);
+            vy0 = VFMACCVF_FLOAT(vy0, temp_ii[3], va7, gvl);
+            vy1 = VFNMSACVF_FLOAT(vy1, temp_rr[3], va7, gvl);
+            vy1 = VFMACCVF_FLOAT(vy1, temp_ii[3], va6, gvl);
+#else
+            vy0 = VFMACCVF_FLOAT(vy0, temp_rr[0], va0, gvl);
+            vy0 = VFNMSACVF_FLOAT(vy0, temp_ii[0], va1, gvl);
+            vy1 = VFNMSACVF_FLOAT(vy1, temp_rr[0], va1, gvl);
+            vy1 = VFNMSACVF_FLOAT(vy1, temp_ii[0], va0, gvl);
+
+            vy0 = VFMACCVF_FLOAT(vy0, temp_rr[1], va2, gvl);
+            vy0 = VFNMSACVF_FLOAT(vy0, temp_ii[1], va3, gvl);
+            vy1 = VFNMSACVF_FLOAT(vy1, temp_rr[1], va3, gvl);
+            vy1 = VFNMSACVF_FLOAT(vy1, temp_ii[1], va2, gvl);
+
+            vy0 = VFMACCVF_FLOAT(vy0, temp_rr[2], va4, gvl);
+            vy0 = VFNMSACVF_FLOAT(vy0, temp_ii[2], va5, gvl);
+            vy1 = VFNMSACVF_FLOAT(vy1, temp_rr[2], va5, gvl);
+            vy1 = VFNMSACVF_FLOAT(vy1, temp_ii[2], va4, gvl);
+
+            vy0 = VFMACCVF_FLOAT(vy0, temp_rr[3], va6, gvl);
+            vy0 = VFNMSACVF_FLOAT(vy0, temp_ii[3], va7, gvl);
+            vy1 = VFNMSACVF_FLOAT(vy1, temp_rr[3], va7, gvl);
+            vy1 = VFNMSACVF_FLOAT(vy1, temp_ii[3], va6, gvl);
+#endif
+#endif
+            a_ptr += lda2 * 4;
+            ix += inc_x2 * 4;
+        }
+
         VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl);
         VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl);
         j += gvl * 2;
@@ -171,3 +321,4 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
 }
 
 
+