@@ -30,27 +30,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
30
#if !defined(DOUBLE )
31
31
/* Single-precision build: forwards n to the RVV vsetvl intrinsic for 32-bit
 * elements at LMUL=1. The '(' must directly follow the macro name; with a
 * space in between this would be an object-like macro and VSETVL(x) would not
 * expand to a call. */
#define VSETVL(n) __riscv_vsetvl_e32m1(n)
32
32
#define FLOAT_V_T vfloat32m1_t
33
+ #define FLOAT_VX2_T vfloat32m1x2_t
34
+ #define FLOAT_VX4_T vfloat32m1x4_t
35
+ #define FLOAT_VX8_T vfloat32m1x8_t
33
36
#define VLEV_FLOAT __riscv_vle32_v_f32m1
34
37
#define VLSEV_FLOAT __riscv_vlse32_v_f32m1
35
38
#define VSEV_FLOAT __riscv_vse32_v_f32m1
36
- #define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m1
37
- #define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1
38
- #define VLSSEG4_FLOAT __riscv_vlsseg4e32_v_f32m1
39
- #define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1
40
- #define VLSSEG8_FLOAT __riscv_vlsseg8e32_v_f32m1
41
- #define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1
39
+ #define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m1x2
40
+ #define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1x2
41
+ #define VLSSEG4_FLOAT __riscv_vlsseg4e32_v_f32m1x4
42
+ #define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1x4
43
+ #define VLSSEG8_FLOAT __riscv_vlsseg8e32_v_f32m1x8
44
+ #define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1x8
42
45
#else
43
46
/* Double-precision build: forwards n to the RVV vsetvl intrinsic for 64-bit
 * elements at LMUL=1. The '(' must directly follow the macro name; with a
 * space in between this would be an object-like macro and VSETVL(x) would not
 * expand to a call. */
#define VSETVL(n) __riscv_vsetvl_e64m1(n)
44
47
#define FLOAT_V_T vfloat64m1_t
48
+ #define FLOAT_VX2_T vfloat64m1x2_t
49
+ #define FLOAT_VX4_T vfloat64m1x4_t
50
+ #define FLOAT_VX8_T vfloat64m1x8_t
45
51
#define VLEV_FLOAT __riscv_vle64_v_f64m1
46
52
#define VLSEV_FLOAT __riscv_vlse64_v_f64m1
47
53
#define VSEV_FLOAT __riscv_vse64_v_f64m1
48
- #define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m1
49
- #define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1
50
- #define VLSSEG4_FLOAT __riscv_vlsseg4e64_v_f64m1
51
- #define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1
52
- #define VLSSEG8_FLOAT __riscv_vlsseg8e64_v_f64m1
53
- #define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1
54
+ #define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m1x2
55
+ #define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1x2
56
+ #define VLSSEG4_FLOAT __riscv_vlsseg4e64_v_f64m1x4
57
+ #define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1x4
58
+ #define VLSSEG8_FLOAT __riscv_vlsseg8e64_v_f64m1x8
59
+ #define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1x8
54
60
#endif
55
61
56
62
int CNAME (BLASLONG m , BLASLONG n , IFLOAT * a , BLASLONG lda , IFLOAT * b )
@@ -62,7 +68,10 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
62
68
63
69
IFLOAT * boffset , * boffset1 , * boffset2 , * boffset3 , * boffset4 ;
64
70
65
- FLOAT_V_T v0 , v1 , v2 , v3 , v4 , v5 , v6 , v7 ;
71
+ FLOAT_V_T v0 ;
72
+ FLOAT_VX2_T vx2 ;
73
+ FLOAT_VX4_T vx4 ;
74
+ FLOAT_VX8_T vx8 ;
66
75
67
76
// fprintf(stderr, "gemm_tcopy_8 m=%ld n=%ld lda=%ld\n", m, n, lda);
68
77
@@ -83,8 +92,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
83
92
for (i = (n >> 3 ); i > 0 ; i -- ) {
84
93
size_t vl = 8 ;
85
94
86
- VLSSEG8_FLOAT ( & v0 , & v1 , & v2 , & v3 , & v4 , & v5 , & v6 , & v7 , aoffset1 , lda * sizeof (FLOAT ), vl );
87
- VSSEG8_FLOAT (boffset1 , v0 , v1 , v2 , v3 , v4 , v5 , v6 , v7 , vl );
95
+ vx8 = VLSSEG8_FLOAT ( aoffset1 , lda * sizeof (FLOAT ), vl );
96
+ VSSEG8_FLOAT (boffset1 , vx8 , vl );
88
97
89
98
aoffset1 += 8 ;
90
99
boffset1 += m * 8 ;
@@ -93,8 +102,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
93
102
if (n & 4 ) {
94
103
size_t vl = 8 ;
95
104
96
- VLSSEG4_FLOAT ( & v0 , & v1 , & v2 , & v3 , aoffset1 , lda * sizeof (FLOAT ), vl );
97
- VSSEG4_FLOAT (boffset2 , v0 , v1 , v2 , v3 , vl );
105
+ vx4 = VLSSEG4_FLOAT ( aoffset1 , lda * sizeof (FLOAT ), vl );
106
+ VSSEG4_FLOAT (boffset2 , vx4 , vl );
98
107
99
108
aoffset1 += 4 ;
100
109
boffset2 += 32 ;
@@ -103,8 +112,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
103
112
if (n & 2 ) {
104
113
size_t vl = 8 ;
105
114
106
- VLSSEG2_FLOAT ( & v0 , & v1 , aoffset1 , lda * sizeof (FLOAT ), vl );
107
- VSSEG2_FLOAT (boffset3 , v0 , v1 , vl );
115
+ vx2 = VLSSEG2_FLOAT ( aoffset1 , lda * sizeof (FLOAT ), vl );
116
+ VSSEG2_FLOAT (boffset3 , vx2 , vl );
108
117
109
118
aoffset1 += 2 ;
110
119
boffset3 += 16 ;
@@ -133,8 +142,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
133
142
for (i = (n >> 3 ); i > 0 ; i -- ) {
134
143
size_t vl = 4 ;
135
144
136
- VLSSEG8_FLOAT ( & v0 , & v1 , & v2 , & v3 , & v4 , & v5 , & v6 , & v7 , aoffset1 , lda * sizeof (FLOAT ), vl );
137
- VSSEG8_FLOAT (boffset1 , v0 , v1 , v2 , v3 , v4 , v5 , v6 , v7 , vl );
145
+ vx8 = VLSSEG8_FLOAT ( aoffset1 , lda * sizeof (FLOAT ), vl );
146
+ VSSEG8_FLOAT (boffset1 , vx8 , vl );
138
147
139
148
aoffset1 += 8 ;
140
149
boffset1 += m * 8 ;
@@ -143,8 +152,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
143
152
if (n & 4 ) {
144
153
size_t vl = 4 ;
145
154
146
- VLSSEG4_FLOAT ( & v0 , & v1 , & v2 , & v3 , aoffset1 , lda * sizeof (FLOAT ), vl );
147
- VSSEG4_FLOAT (boffset2 , v0 , v1 , v2 , v3 , vl );
155
+ vx4 = VLSSEG4_FLOAT ( aoffset1 , lda * sizeof (FLOAT ), vl );
156
+ VSSEG4_FLOAT (boffset2 , vx4 , vl );
148
157
149
158
aoffset1 += 4 ;
150
159
boffset2 += 16 ;
@@ -153,8 +162,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
153
162
if (n & 2 ) {
154
163
size_t vl = 4 ;
155
164
156
- VLSSEG2_FLOAT ( & v0 , & v1 , aoffset1 , lda * sizeof (FLOAT ), vl );
157
- VSSEG2_FLOAT (boffset3 , v0 , v1 , vl );
165
+ vx2 = VLSSEG2_FLOAT ( aoffset1 , lda * sizeof (FLOAT ), vl );
166
+ VSSEG2_FLOAT (boffset3 , vx2 , vl );
158
167
159
168
aoffset1 += 2 ;
160
169
boffset3 += 8 ;
@@ -181,8 +190,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
181
190
for (i = (n >> 3 ); i > 0 ; i -- ) {
182
191
size_t vl = 2 ;
183
192
184
- VLSSEG8_FLOAT ( & v0 , & v1 , & v2 , & v3 , & v4 , & v5 , & v6 , & v7 , aoffset1 , lda * sizeof (FLOAT ), vl );
185
- VSSEG8_FLOAT (boffset1 , v0 , v1 , v2 , v3 , v4 , v5 , v6 , v7 , vl );
193
+ vx8 = VLSSEG8_FLOAT ( aoffset1 , lda * sizeof (FLOAT ), vl );
194
+ VSSEG8_FLOAT (boffset1 , vx8 , vl );
186
195
187
196
aoffset1 += 8 ;
188
197
boffset1 += m * 8 ;
@@ -191,8 +200,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
191
200
if (n & 4 ) {
192
201
size_t vl = 2 ;
193
202
194
- VLSSEG4_FLOAT ( & v0 , & v1 , & v2 , & v3 , aoffset1 , lda * sizeof (FLOAT ), vl );
195
- VSSEG4_FLOAT (boffset2 , v0 , v1 , v2 , v3 , vl );
203
+ vx4 = VLSSEG4_FLOAT ( aoffset1 , lda * sizeof (FLOAT ), vl );
204
+ VSSEG4_FLOAT (boffset2 , vx4 , vl );
196
205
197
206
aoffset1 += 4 ;
198
207
boffset2 += 8 ;
@@ -201,8 +210,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
201
210
if (n & 2 ) {
202
211
size_t vl = 2 ;
203
212
204
- VLSSEG2_FLOAT ( & v0 , & v1 , aoffset1 , lda * sizeof (FLOAT ), vl );
205
- VSSEG2_FLOAT (boffset3 , v0 , v1 , vl );
213
+ vx2 = VLSSEG2_FLOAT ( aoffset1 , lda * sizeof (FLOAT ), vl );
214
+ VSSEG2_FLOAT (boffset3 , vx2 , vl );
206
215
207
216
aoffset1 += 2 ;
208
217
boffset3 += 4 ;
0 commit comments