Commit 0bea1cf
Author: tingbo.liao (committed)

    Optimize the zgemm_tcopy_4_rvv function so that it works for both vector lengths (VLEN) of 128 and 256.

    Signed-off-by: tingbo.liao <tingbo.liao@starfivetech.com>

1 parent: d00cc40

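Background for the change (reasoning reconstructed from the diff below, not stated in the commit message): the rewritten loop copies four complex elements per iteration, i.e. a fixed vl of 8 real elements, and an RVV register group holds VLEN * LMUL / SEW elements. With the old m1 types, a 32-bit element group only guarantees 4 elements at VLEN = 128, which is why the old code worked column-wise with strided segment loads; widening the types to m2 (m4 for double precision, with m1/m2 "half" variants for the two-column tail) makes vl = 8 fit at VLEN = 128 as well as 256, so plain unit-stride vle/vse accesses suffice. A minimal, hypothetical standalone check illustrating that capacity argument (not part of the commit; assumes a compiler and target with the V extension, e.g. -march=rv64gcv):

    #include <riscv_vector.h>
    #include <stdio.h>

    /* Sketch only: verify that the fixed request sizes used by the kernel
     * fit into one register group on this core.  VLMAX = VLEN * LMUL / SEW,
     * so e32m2 yields 8 lanes even at VLEN = 128 (and 16 at VLEN = 256). */
    int main(void)
    {
        size_t vl_main = __riscv_vsetvl_e32m2(8); /* 4 complex floats = 8 reals */
        size_t vl_half = __riscv_vsetvl_e32m1(4); /* 2 complex floats = 4 reals */

        printf("vl_main = %zu, vl_half = %zu\n", vl_main, vl_half);

        /* Both requests must come back unclamped for the kernel's
         * fixed vl values to be safe on this hardware. */
        return (vl_main == 8 && vl_half == 4) ? 0 : 1;
    }
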

kernel/riscv64/zgemm_tcopy_4_rvv.c

Lines changed: 25 additions & 111 deletions
@@ -28,35 +28,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) __riscv_vsetvl_e32m1(n)
-#define FLOAT_V_T vfloat32m1_t
-#define FLOAT_VX2_T vfloat32m1x2_t
-#define FLOAT_VX4_T vfloat32m1x4_t
-#define FLOAT_VX8_T vfloat32m1x8_t
-#define VLEV_FLOAT __riscv_vle32_v_f32m1
-#define VSEV_FLOAT __riscv_vse32_v_f32m1
-#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m1x2
-#define VLSSEG4_FLOAT __riscv_vlsseg4e32_v_f32m1x4
-#define VLSSEG8_FLOAT __riscv_vlsseg8e32_v_f32m1x8
-#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1x2
-#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1x4
-#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1x8
+#define FLOAT_V_T vfloat32m2_t
+#define FLOAT_V_T_HALF vfloat32m1_t
+#define VLEV_FLOAT __riscv_vle32_v_f32m2
+#define VLEV_FLOAT_HALF __riscv_vle32_v_f32m1
+#define VSEV_FLOAT __riscv_vse32_v_f32m2
+#define VSEV_FLOAT_HALF __riscv_vse32_v_f32m1
 #else
-#define VSETVL(n) __riscv_vsetvl_e64m1(n)
-#define FLOAT_V_T vfloat64m1_t
-#define FLOAT_VX2_T vfloat64m1x2_t
-#define FLOAT_VX4_T vfloat64m1x4_t
-#define FLOAT_VX8_T vfloat64m1x8_t
-#define VLEV_FLOAT __riscv_vle64_v_f64m1
-#define VSEV_FLOAT __riscv_vse64_v_f64m1
-#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m1x2
-#define VLSSEG4_FLOAT __riscv_vlsseg4e64_v_f64m1x4
-#define VLSSEG8_FLOAT __riscv_vlsseg8e64_v_f64m1x8
-#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1x2
-#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1x4
-#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1x8
+#define FLOAT_V_T vfloat64m4_t
+#define FLOAT_V_T_HALF vfloat64m2_t
+#define VLEV_FLOAT __riscv_vle64_v_f64m4
+#define VLEV_FLOAT_HALF __riscv_vle64_v_f64m2
+#define VSEV_FLOAT __riscv_vse64_v_f64m4
+#define VSEV_FLOAT_HALF __riscv_vse64_v_f64m2
 #endif
 
+
 int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
 
 	BLASLONG i, j;
@@ -67,9 +54,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
 	IFLOAT *boffset, *boffset1, *boffset2, *boffset3;
 
 	FLOAT_V_T v0;
-	FLOAT_VX2_T vx2;
-	FLOAT_VX4_T vx4;
-	FLOAT_VX8_T vx8;
+	FLOAT_V_T_HALF v1;
 
 	size_t vl;
 
@@ -80,86 +65,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
 	boffset2 = b + 2 * m * (n & ~3);
 	boffset3 = b + 2 * m * (n & ~1);
 
-	for(j = (m >> 2); j > 0; j--) {
-
-		aoffset1 = aoffset;
-		aoffset += 8 * lda;
-
-		boffset1 = boffset;
-		boffset += 32;
-
-		for(i = (n >> 2); i > 0; i--) {
-			vl = 4;
-
-			vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl);
-			VSSEG8_FLOAT(boffset1, vx8, vl);
-
-			aoffset1 += 8;
-			boffset1 += m * 8;
-		}
-
-		if (n & 2) {
-			vl = 4;
-
-			vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl);
-			VSSEG4_FLOAT(boffset2, vx4, vl);
-
-			aoffset1 += 4;
-			boffset2 += 16;
-		}
-
-		if (n & 1) {
-			vl = 4;
-
-			vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl);
-			VSSEG2_FLOAT(boffset3, vx2, vl);
-
-			aoffset1 += 2;
-			boffset3 += 8;
-		}
-	}
-
-	if (m & 2) {
+	for(j = m; j > 0; j--) {
 		aoffset1 = aoffset;
-		aoffset += 4 * lda;
-
 		boffset1 = boffset;
-		boffset += 16;
-
-		for(i = (n >> 2); i > 0; i--) {
-			vl = 2;
-
-			vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl);
-			VSSEG8_FLOAT(boffset1, vx8, vl);
-
-			aoffset1 += 8;
-			boffset1 += m * 8;
-		}
-
-		if (n & 2) {
-			vl = 2;
-
-			vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl);
-			VSSEG4_FLOAT(boffset2, vx4, vl);
-
-			aoffset1 += 4;
-			boffset2 += 8;
-		}
-
-		if (n & 1) {
-			vl = 2;
 
-			vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl);
-			VSSEG2_FLOAT(boffset3, vx2, vl);
-
-			//aoffset1 += 2;
-			boffset3 += 4;
-		}
-	}
-
-	if (m & 1) {
-		aoffset1 = aoffset;
-		boffset1 = boffset;
+		aoffset += 2 * lda;
+		boffset += 8;
 
 		for(i = (n >> 2); i > 0; i--) {
 			vl = 8;
@@ -174,16 +85,19 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
 		if (n & 2) {
 			vl = 4;
 
-			v0 = VLEV_FLOAT(aoffset1, vl);
-			VSEV_FLOAT(boffset2, v0, vl);
+			v1 = VLEV_FLOAT_HALF(aoffset1, vl);
+			VSEV_FLOAT_HALF(boffset2, v1, vl);
 
 			aoffset1 += 4;
-			//boffset2 += 4;
+			boffset2 += 4;
 		}
 
 		if (n & 1) {
-			*(boffset3) = *(aoffset1);
-			*(boffset3 + 1) = *(aoffset1 + 1);
+			*(boffset3) = *(aoffset1);
+			*(boffset3 + 1) = *(aoffset1 + 1);
+
+			aoffset1 += 2;
+			boffset3 += 2;
 		}
 	}
 

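For readers of the new layout: after this change each of the m rows is visited exactly once; blocks of four complex columns land row-interleaved in the leading panels of b, the two-column remainder in the boffset2 panel, and a final odd column in the boffset3 panel. A scalar sketch of the equivalent packing, under stated assumptions (FLOAT taken as double; a indexed as m rows of n complex elements with row stride lda, matching how the kernel indexes it; the name ztcopy4_ref is invented here for illustration):

    #include <stddef.h>

    /* Scalar reference of the copy the rewritten kernel performs.
     * Real/imaginary parts are interleaved, hence the factors of 2. */
    static void ztcopy4_ref(long m, long n, const double *a, long lda, double *b)
    {
        double *b1 = b;                      /* panels of 4 complex columns */
        double *b2 = b + 2 * m * (n & ~3L);  /* 2-column remainder panel    */
        double *b3 = b + 2 * m * (n & ~1L);  /* final single-column panel   */

        for (long j = 0; j < m; j++) {
            const double *aj = a + 2 * j * lda; /* start of row j            */
            double *bj = b1 + 8 * j;            /* row j's slot in each panel */

            for (long i = 0; i < (n >> 2); i++) { /* 4 complex = 8 reals */
                for (int k = 0; k < 8; k++) bj[k] = aj[k];
                aj += 8;
                bj += 8 * m;                 /* next panel, same row slot */
            }
            if (n & 2) {                     /* 2 complex = 4 reals */
                for (int k = 0; k < 4; k++) b2[k] = aj[k];
                aj += 4;
                b2 += 4;
            }
            if (n & 1) {                     /* last complex element */
                b3[0] = aj[0];
                b3[1] = aj[1];
                b3 += 2;
            }
        }
    }

The vector kernel follows the same structure, with the inner 8-element copy done by one vle/vse pair at vl = 8 (the widened m2/m4 types guarantee that fits from VLEN = 128 upward) and the 4-element tail by the _HALF variants at vl = 4.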