@@ -28,35 +28,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28
28
#include "common.h"
29
29
30
30
#if !defined(DOUBLE )
31
- #define VSETVL (n ) __riscv_vsetvl_e32m1(n)
32
- #define FLOAT_V_T vfloat32m1_t
33
- #define FLOAT_VX2_T vfloat32m1x2_t
34
- #define FLOAT_VX4_T vfloat32m1x4_t
35
- #define FLOAT_VX8_T vfloat32m1x8_t
36
- #define VLEV_FLOAT __riscv_vle32_v_f32m1
37
- #define VSEV_FLOAT __riscv_vse32_v_f32m1
38
- #define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m1x2
39
- #define VLSSEG4_FLOAT __riscv_vlsseg4e32_v_f32m1x4
40
- #define VLSSEG8_FLOAT __riscv_vlsseg8e32_v_f32m1x8
41
- #define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1x2
42
- #define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1x4
43
- #define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1x8
31
+ #define FLOAT_V_T vfloat32m2_t
32
+ #define FLOAT_V_T_HALF vfloat32m1_t
33
+ #define VLEV_FLOAT __riscv_vle32_v_f32m2
34
+ #define VLEV_FLOAT_HALF __riscv_vle32_v_f32m1
35
+ #define VSEV_FLOAT __riscv_vse32_v_f32m2
36
+ #define VSEV_FLOAT_HALF __riscv_vse32_v_f32m1
44
37
#else
45
- #define VSETVL (n ) __riscv_vsetvl_e64m1(n)
46
- #define FLOAT_V_T vfloat64m1_t
47
- #define FLOAT_VX2_T vfloat64m1x2_t
48
- #define FLOAT_VX4_T vfloat64m1x4_t
49
- #define FLOAT_VX8_T vfloat64m1x8_t
50
- #define VLEV_FLOAT __riscv_vle64_v_f64m1
51
- #define VSEV_FLOAT __riscv_vse64_v_f64m1
52
- #define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m1x2
53
- #define VLSSEG4_FLOAT __riscv_vlsseg4e64_v_f64m1x4
54
- #define VLSSEG8_FLOAT __riscv_vlsseg8e64_v_f64m1x8
55
- #define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1x2
56
- #define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1x4
57
- #define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1x8
38
+ #define FLOAT_V_T vfloat64m4_t
39
+ #define FLOAT_V_T_HALF vfloat64m2_t
40
+ #define VLEV_FLOAT __riscv_vle64_v_f64m4
41
+ #define VLEV_FLOAT_HALF __riscv_vle64_v_f64m2
42
+ #define VSEV_FLOAT __riscv_vse64_v_f64m4
43
+ #define VSEV_FLOAT_HALF __riscv_vse64_v_f64m2
58
44
#endif
59
45
46
+
60
47
int CNAME (BLASLONG m , BLASLONG n , FLOAT * a , BLASLONG lda , FLOAT * b ){
61
48
62
49
BLASLONG i , j ;
@@ -67,9 +54,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
67
54
IFLOAT * boffset , * boffset1 , * boffset2 , * boffset3 ;
68
55
69
56
FLOAT_V_T v0 ;
70
- FLOAT_VX2_T vx2 ;
71
- FLOAT_VX4_T vx4 ;
72
- FLOAT_VX8_T vx8 ;
57
+ FLOAT_V_T_HALF v1 ;
73
58
74
59
size_t vl ;
75
60
@@ -80,86 +65,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
80
65
boffset2 = b + 2 * m * (n & ~3 );
81
66
boffset3 = b + 2 * m * (n & ~1 );
82
67
83
- for (j = (m >> 2 ); j > 0 ; j -- ) {
84
-
85
- aoffset1 = aoffset ;
86
- aoffset += 8 * lda ;
87
-
88
- boffset1 = boffset ;
89
- boffset += 32 ;
90
-
91
- for (i = (n >> 2 ); i > 0 ; i -- ) {
92
- vl = 4 ;
93
-
94
- vx8 = VLSSEG8_FLOAT (aoffset1 , lda * sizeof (FLOAT ) * 2 , vl );
95
- VSSEG8_FLOAT (boffset1 , vx8 , vl );
96
-
97
- aoffset1 += 8 ;
98
- boffset1 += m * 8 ;
99
- }
100
-
101
- if (n & 2 ) {
102
- vl = 4 ;
103
-
104
- vx4 = VLSSEG4_FLOAT (aoffset1 , lda * sizeof (FLOAT ) * 2 , vl );
105
- VSSEG4_FLOAT (boffset2 , vx4 , vl );
106
-
107
- aoffset1 += 4 ;
108
- boffset2 += 16 ;
109
- }
110
-
111
- if (n & 1 ) {
112
- vl = 4 ;
113
-
114
- vx2 = VLSSEG2_FLOAT (aoffset1 , lda * sizeof (FLOAT ) * 2 , vl );
115
- VSSEG2_FLOAT (boffset3 , vx2 , vl );
116
-
117
- aoffset1 += 2 ;
118
- boffset3 += 8 ;
119
- }
120
- }
121
-
122
- if (m & 2 ) {
68
+ for (j = m ; j > 0 ; j -- ) {
123
69
aoffset1 = aoffset ;
124
- aoffset += 4 * lda ;
125
-
126
70
boffset1 = boffset ;
127
- boffset += 16 ;
128
-
129
- for (i = (n >> 2 ); i > 0 ; i -- ) {
130
- vl = 2 ;
131
-
132
- vx8 = VLSSEG8_FLOAT (aoffset1 , lda * sizeof (FLOAT ) * 2 , vl );
133
- VSSEG8_FLOAT (boffset1 , vx8 , vl );
134
-
135
- aoffset1 += 8 ;
136
- boffset1 += m * 8 ;
137
- }
138
-
139
- if (n & 2 ) {
140
- vl = 2 ;
141
-
142
- vx4 = VLSSEG4_FLOAT (aoffset1 , lda * sizeof (FLOAT ) * 2 , vl );
143
- VSSEG4_FLOAT (boffset2 , vx4 , vl );
144
-
145
- aoffset1 += 4 ;
146
- boffset2 += 8 ;
147
- }
148
-
149
- if (n & 1 ) {
150
- vl = 2 ;
151
71
152
- vx2 = VLSSEG2_FLOAT (aoffset1 , lda * sizeof (FLOAT ) * 2 , vl );
153
- VSSEG2_FLOAT (boffset3 , vx2 , vl );
154
-
155
- //aoffset1 += 2;
156
- boffset3 += 4 ;
157
- }
158
- }
159
-
160
- if (m & 1 ) {
161
- aoffset1 = aoffset ;
162
- boffset1 = boffset ;
72
+ aoffset += 2 * lda ;
73
+ boffset += 8 ;
163
74
164
75
for (i = (n >> 2 ); i > 0 ; i -- ) {
165
76
vl = 8 ;
@@ -174,16 +85,19 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
174
85
if (n & 2 ) {
175
86
vl = 4 ;
176
87
177
- v0 = VLEV_FLOAT (aoffset1 , vl );
178
- VSEV_FLOAT (boffset2 , v0 , vl );
88
+ v1 = VLEV_FLOAT_HALF (aoffset1 , vl );
89
+ VSEV_FLOAT_HALF (boffset2 , v1 , vl );
179
90
180
91
aoffset1 += 4 ;
181
- // boffset2 += 4;
92
+ boffset2 += 4 ;
182
93
}
183
94
184
95
if (n & 1 ) {
185
- * (boffset3 ) = * (aoffset1 );
186
- * (boffset3 + 1 ) = * (aoffset1 + 1 );
96
+ * (boffset3 ) = * (aoffset1 );
97
+ * (boffset3 + 1 ) = * (aoffset1 + 1 );
98
+
99
+ aoffset1 += 2 ;
100
+ boffset3 += 2 ;
187
101
}
188
102
}
189
103
0 commit comments