@@ -27,185 +27,223 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 #include "common.h"

-#if defined(DOUBLE)
-#define VSETVL __riscv_vsetvl_e64m4
-#define FLOAT_V_T vfloat64m4_t
-#define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT __riscv_vle64_v_f64m4
-#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
-#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4
-#define VFMVSF_FLOAT __riscv_vfmv_s_f_f64m4
-#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
-#define MASK_T vbool16_t
-#define VFABS __riscv_vfabs_v_f64m4
-#define VMFNE __riscv_vmfne_vf_f64m4_b16
-#define VMFGT __riscv_vmfgt_vv_f64m4_b16
-#define VMFEQ __riscv_vmfeq_vf_f64m4_b16
-#define VCPOP __riscv_vcpop_m_b16
-#define VFREDMAX __riscv_vfredmax_vs_f64m4_f64m1
-#define VFREDMIN __riscv_vfredmin_vs_f64m4_f64m1
-#define VFIRST __riscv_vfirst_m_b16
-#define VRGATHER __riscv_vrgather_vx_f64m4
-#define VFDIV __riscv_vfdiv_vv_f64m4
-#define VFDIV_M __riscv_vfdiv_vv_f64m4_mu
-#define VFMUL __riscv_vfmul_vv_f64m4
-#define VFMUL_M __riscv_vfmul_vv_f64m4_mu
-#define VFMACC __riscv_vfmacc_vv_f64m4
-#define VFMACC_M __riscv_vfmacc_vv_f64m4_mu
-#define VMSBF __riscv_vmsbf_m_b16
-#define VMSOF __riscv_vmsof_m_b16
-#define VMAND __riscv_vmand_mm_b16
-#define VMANDN __riscv_vmand_mm_b16
-#define VFREDSUM __riscv_vfredusum_vs_f64m4_f64m1
-#define VMERGE __riscv_vmerge_vvm_f64m4
-#define VSEV_FLOAT __riscv_vse64_v_f64m4
-#define EXTRACT_FLOAT0_V(v) __riscv_vfmv_f_s_f64m4_f64(v)
-#define ABS fabs
-#else
-#define VSETVL __riscv_vsetvl_e32m4
+#if !defined(DOUBLE)
+#define VSETVL(n) __riscv_vsetvl_e32m4(n)
+#define VSETVL_MAX __riscv_vsetvlmax_e32m4()
 #define FLOAT_V_T vfloat32m4_t
 #define FLOAT_V_T_M1 vfloat32m1_t
+#define MASK_T vbool8_t
 #define VLEV_FLOAT __riscv_vle32_v_f32m4
 #define VLSEV_FLOAT __riscv_vlse32_v_f32m4
+#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1_tu
+#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu
 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4
-#define VFMVSF_FLOAT __riscv_vfmv_s_f_f32m4
 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
-#define MASK_T vbool8_t
-#define VFABS __riscv_vfabs_v_f32m4
-#define VMFNE __riscv_vmfne_vf_f32m4_b8
-#define VMFGT __riscv_vmfgt_vv_f32m4_b8
-#define VMFEQ __riscv_vmfeq_vf_f32m4_b8
-#define VCPOP __riscv_vcpop_m_b8
-#define VFREDMAX __riscv_vfredmax_vs_f32m4_f32m1
-#define VFREDMIN __riscv_vfredmin_vs_f32m4_f32m1
-#define VFIRST __riscv_vfirst_m_b8
-#define VRGATHER __riscv_vrgather_vx_f32m4
-#define VFDIV __riscv_vfdiv_vv_f32m4
-#define VFDIV_M __riscv_vfdiv_vv_f32m4_mu
-#define VFMUL __riscv_vfmul_vv_f32m4
-#define VFMUL_M __riscv_vfmul_vv_f32m4_mu
-#define VFMACC __riscv_vfmacc_vv_f32m4
-#define VFMACC_M __riscv_vfmacc_vv_f32m4_mu
-#define VMSBF __riscv_vmsbf_m_b8
-#define VMSOF __riscv_vmsof_m_b8
-#define VMAND __riscv_vmand_mm_b8
-#define VMANDN __riscv_vmand_mm_b8
-#define VFREDSUM __riscv_vfredusum_vs_f32m4_f32m1
-#define VMERGE __riscv_vmerge_vvm_f32m4
-#define VSEV_FLOAT __riscv_vse32_v_f32m4
-#define EXTRACT_FLOAT0_V(v) __riscv_vfmv_f_s_f32m4_f32(v)
+#define VMFIRSTM __riscv_vfirst_m_b8
+#define VFREDMAXVS_FLOAT_TU __riscv_vfredmax_vs_f32m4_f32m1_tu
+#define VFMVFS_FLOAT __riscv_vfmv_f_s_f32m1_f32
+#define VMFGTVF_FLOAT __riscv_vmfgt_vf_f32m4_b8
+#define VFDIVVF_FLOAT __riscv_vfdiv_vf_f32m4
+#define VFABSV_FLOAT __riscv_vfabs_v_f32m4
 #define ABS fabsf
+#else
+#define VSETVL(n) __riscv_vsetvl_e64m4(n)
+#define VSETVL_MAX __riscv_vsetvlmax_e64m4()
+#define FLOAT_V_T vfloat64m4_t
+#define FLOAT_V_T_M1 vfloat64m1_t
+#define MASK_T vbool16_t
+#define VLEV_FLOAT __riscv_vle64_v_f64m4
+#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
+#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1_tu
+#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
+#define VMFIRSTM __riscv_vfirst_m_b16
+#define VFREDMAXVS_FLOAT_TU __riscv_vfredmax_vs_f64m4_f64m1_tu
+#define VFMVFS_FLOAT __riscv_vfmv_f_s_f64m1_f64
+#define VMFGTVF_FLOAT __riscv_vmfgt_vf_f64m4_b16
+#define VFDIVVF_FLOAT __riscv_vfdiv_vf_f64m4
+#define VFABSV_FLOAT __riscv_vfabs_v_f64m4
+#define ABS fabs
 #endif

 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
-    BLASLONG i = 0;
-
-    if (n <= 0 || inc_x == 0) return(0.0);
-    if (n == 1) return(ABS(x[0]));
-
-    unsigned int gvl = 0;
-
-    MASK_T nonzero_mask;
-    MASK_T scale_mask;
-
-    gvl = VSETVL(n);
-    FLOAT_V_T v0;
-    FLOAT_V_T v_ssq = VFMVVF_FLOAT(0, gvl);
-    FLOAT_V_T v_scale = VFMVVF_FLOAT(0, gvl);
-
-    FLOAT scale = 0;
-    FLOAT ssq = 0;
-    unsigned int stride_x = inc_x * sizeof(FLOAT);
-    int idx = 0;
-
-    if (n >= gvl && inc_x > 0) // don't pay overheads if we're not doing useful work
-    {
-        for (i = 0; i < n / gvl; i++) {
-            v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl);
-            nonzero_mask = VMFNE(v0, 0, gvl);
-            v0 = VFABS(v0, gvl);
-            scale_mask = VMFGT(v0, v_scale, gvl);
-
-            // assume scale changes are relatively infrequent
-
-            // unclear if the vcpop+branch is actually a win
-            // since the operations being skipped are predicated anyway
-            // need profiling to confirm
-            if (VCPOP(scale_mask, gvl))
-            {
-                v_scale = VFDIV_M(scale_mask, v_scale, v_scale, v0, gvl);
-                v_scale = VFMUL_M(scale_mask, v_scale, v_scale, v_scale, gvl);
-                v_ssq = VFMUL_M(scale_mask, v_ssq, v_ssq, v_scale, gvl);
-                v_scale = VMERGE(v_scale, v0, scale_mask, gvl);
-            }
-            v0 = VFDIV_M(nonzero_mask, v0, v0, v_scale, gvl);
-            v_ssq = VFMACC_M(nonzero_mask, v_ssq, v0, v0, gvl);
-            idx += inc_x * gvl;
-        }
-
-        // we have gvl elements which we accumulated independently, with independent scales
-        // we need to combine these
-        // naive sort so we process small values first to avoid losing information
-        // could use vector sort extensions where available, but we're dealing with gvl elts at most
-
-        FLOAT *out_ssq = alloca(gvl * sizeof(FLOAT));
-        FLOAT *out_scale = alloca(gvl * sizeof(FLOAT));
-        VSEV_FLOAT(out_ssq, v_ssq, gvl);
-        VSEV_FLOAT(out_scale, v_scale, gvl);
-        for (int a = 0; a < (gvl - 1); ++a)
-        {
-            int smallest = a;
-            for (size_t b = a + 1; b < gvl; ++b)
-                if (out_scale[b] < out_scale[smallest])
-                    smallest = b;
-            if (smallest != a)
-            {
-                FLOAT tmp1 = out_ssq[a];
-                FLOAT tmp2 = out_scale[a];
-                out_ssq[a] = out_ssq[smallest];
-                out_scale[a] = out_scale[smallest];
-                out_ssq[smallest] = tmp1;
-                out_scale[smallest] = tmp2;
-            }
-        }
-
-        int a = 0;
-        while (a < gvl && out_scale[a] == 0)
-            ++a;
-
-        if (a < gvl)
-        {
-            ssq = out_ssq[a];
-            scale = out_scale[a];
-            ++a;
-            for (; a < gvl; ++a)
-            {
-                ssq = ssq * (scale / out_scale[a]) * (scale / out_scale[a]) + out_ssq[a];
-                scale = out_scale[a];
-            }
-        }
-    }
-
-    // finish any tail using scalar ops
-    i *= gvl * inc_x;
-    n *= inc_x;
+    if (n <= 0 || inc_x == 0) return(0.0);
+    if (n == 1) return(ABS(x[0]));
+
+    BLASLONG i = 0, j = 0;
+    FLOAT scale = 0.0, ssq = 0.0;
+
+    if (inc_x > 0) {
+        FLOAT_V_T vr, v0, v_zero;
+        unsigned int gvl = 0;
+        FLOAT_V_T_M1 v_res, v_z0;
+        gvl = VSETVL_MAX;
+        v_res = VFMVVF_FLOAT_M1(0, gvl);
+        v_z0 = VFMVVF_FLOAT_M1(0, gvl);
+        MASK_T mask;
+        BLASLONG index = 0;
+
+        if (inc_x == 1) {
+            gvl = VSETVL(n);
+            vr = VFMVVF_FLOAT(0, gvl);
+            v_zero = VFMVVF_FLOAT(0, gvl);
+            for (i = 0, j = 0; i < n / gvl; i++) {
+                v0 = VLEV_FLOAT(&x[j], gvl);
+                // fabs(vector)
+                v0 = VFABSV_FLOAT(v0, gvl);
+                // if scale change
+                mask = VMFGTVF_FLOAT(v0, scale, gvl);
+                index = VMFIRSTM(mask, gvl);
+                if (index == -1) { // no elements greater than scale
+                    if (scale != 0.0) {
+                        v0 = VFDIVVF_FLOAT(v0, scale, gvl);
+                        vr = VFMACCVV_FLOAT_TU(vr, v0, v0, gvl);
+                    }
+                }
+                else { // found greater element
+                    // ssq in vector vr: vr[0]
+                    v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
+                    // total ssq before current vector
+                    ssq += VFMVFS_FLOAT(v_res);
+                    // find max
+                    v_res = VFREDMAXVS_FLOAT_TU(v_res, v0, v_z0, gvl);
+                    // update ssq before max_index
+                    ssq = ssq * (scale / VFMVFS_FLOAT(v_res)) * (scale / VFMVFS_FLOAT(v_res));
+                    // update scale
+                    scale = VFMVFS_FLOAT(v_res);
+                    // ssq in vector vr
+                    v0 = VFDIVVF_FLOAT(v0, scale, gvl);
+                    vr = VFMACCVV_FLOAT_TU(v_zero, v0, v0, gvl);
+                }
+                j += gvl;
+            }
+            // ssq in vector vr: vr[0]
+            v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
+            // total ssq now
+            ssq += VFMVFS_FLOAT(v_res);
+
+            // tail processing
+            if (j < n) {
+                gvl = VSETVL(n - j);
+                v0 = VLEV_FLOAT(&x[j], gvl);
+                // fabs(vector)
+                v0 = VFABSV_FLOAT(v0, gvl);
+                // if scale change
+                mask = VMFGTVF_FLOAT(v0, scale, gvl);
+                index = VMFIRSTM(mask, gvl);
+                if (index == -1) { // no elements greater than scale
+                    if (scale != 0.0)
+                        v0 = VFDIVVF_FLOAT(v0, scale, gvl);
+                } else { // found greater element
+                    // find max
+                    v_res = VFREDMAXVS_FLOAT_TU(v_res, v0, v_z0, gvl);
+                    // update ssq before max_index
+                    ssq = ssq * (scale / VFMVFS_FLOAT(v_res)) * (scale / VFMVFS_FLOAT(v_res));
+                    // update scale
+                    scale = VFMVFS_FLOAT(v_res);
+                    v0 = VFDIVVF_FLOAT(v0, scale, gvl);
+                }
+                vr = VFMACCVV_FLOAT_TU(v_zero, v0, v0, gvl);
+                // ssq in vector vr: vr[0]
+                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
+                // total ssq now
+                ssq += VFMVFS_FLOAT(v_res);
+            }
+        }
+        else {
+            gvl = VSETVL(n);
+            vr = VFMVVF_FLOAT(0, gvl);
+            v_zero = VFMVVF_FLOAT(0, gvl);
+            unsigned int stride_x = inc_x * sizeof(FLOAT);
+            int idx = 0, inc_v = inc_x * gvl;
+            for (i = 0, j = 0; i < n / gvl; i++) {
+                v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl);
+                // fabs(vector)
+                v0 = VFABSV_FLOAT(v0, gvl);
+                // if scale change
+                mask = VMFGTVF_FLOAT(v0, scale, gvl);
+                index = VMFIRSTM(mask, gvl);
+                if (index == -1) { // no elements greater than scale
+                    if (scale != 0.0) {
+                        v0 = VFDIVVF_FLOAT(v0, scale, gvl);
+                        vr = VFMACCVV_FLOAT_TU(vr, v0, v0, gvl);
+                    }
+                }
+                else { // found greater element
+                    // ssq in vector vr: vr[0]
+                    v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
+                    // total ssq before current vector
+                    ssq += VFMVFS_FLOAT(v_res);
+                    // find max
+                    v_res = VFREDMAXVS_FLOAT_TU(v_res, v0, v_z0, gvl);
+                    // update ssq before max_index
+                    ssq = ssq * (scale / VFMVFS_FLOAT(v_res)) * (scale / VFMVFS_FLOAT(v_res));
+                    // update scale
+                    scale = VFMVFS_FLOAT(v_res);
+                    // ssq in vector vr
+                    v0 = VFDIVVF_FLOAT(v0, scale, gvl);
+                    vr = VFMACCVV_FLOAT_TU(v_zero, v0, v0, gvl);
+                }
+                j += gvl;
+                idx += inc_v;
+            }
+            // ssq in vector vr: vr[0]
+            v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
+            // total ssq now
+            ssq += VFMVFS_FLOAT(v_res);
+
+            // tail processing
+            if (j < n) {
+                gvl = VSETVL(n - j);
+                v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl);
+                // fabs(vector)
+                v0 = VFABSV_FLOAT(v0, gvl);
+                // if scale change
+                mask = VMFGTVF_FLOAT(v0, scale, gvl);
+                index = VMFIRSTM(mask, gvl);
+                if (index == -1) { // no elements greater than scale
+                    if (scale != 0.0) {
+                        v0 = VFDIVVF_FLOAT(v0, scale, gvl);
+                        vr = VFMACCVV_FLOAT_TU(v_zero, v0, v0, gvl);
+                    }
+                }
+                else { // found greater element
+                    // find max
+                    v_res = VFREDMAXVS_FLOAT_TU(v_res, v0, v_z0, gvl);
+                    // update ssq before max_index
+                    ssq = ssq * (scale / VFMVFS_FLOAT(v_res)) * (scale / VFMVFS_FLOAT(v_res));
+                    // update scale
+                    scale = VFMVFS_FLOAT(v_res);
+                    v0 = VFDIVVF_FLOAT(v0, scale, gvl);
+                    vr = VFMACCVV_FLOAT_TU(v_zero, v0, v0, gvl);
+                }
+                // ssq in vector vr: vr[0]
+                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
+                // total ssq now
+                ssq += VFMVFS_FLOAT(v_res);
+            }
+        }
+    }
+    else {
+        // using scalar ops when inc_x < 0
+        n *= inc_x;
     while (abs(i) < abs(n)) {
-        if (x[i] != 0.0) {
-            FLOAT absxi = ABS(x[i]);
-            if (scale < absxi) {
-                ssq = 1 + ssq * (scale / absxi) * (scale / absxi);
-                scale = absxi;
-            }
-            else {
-                ssq += (absxi / scale) * (absxi / scale);
-            }
-
-        }
-
-        i += inc_x;
+            if (x[i] != 0.0) {
+                FLOAT absxi = ABS(x[i]);
+                if (scale < absxi) {
+                    ssq = 1 + ssq * (scale / absxi) * (scale / absxi);
+                    scale = absxi;
+                }
+                else {
+                    ssq += (absxi / scale) * (absxi / scale);
+                }

+            }
+            i += inc_x;
     }
-
+    }
     return(scale * sqrt(ssq));
 }
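Both the removed and the added kernels implement the LAPACK-style scaled sum-of-squares recurrence: track a running `scale` (the largest magnitude seen so far) and a sum `ssq` such that the norm is `scale * sqrt(ssq)`, rescaling `ssq` whenever a larger element appears so that neither huge nor tiny inputs overflow or underflow the intermediate sum. A minimal scalar sketch of that recurrence, for reference only (`nrm2_ref` is an illustrative name, not part of OpenBLAS, and it assumes `inc_x > 0`):

```c
#include <math.h>

/* Reference scalar nrm2 using the scaled sum-of-squares recurrence.
 * Illustrative sketch; assumes inc_x > 0. */
static double nrm2_ref(long n, const double *x, long inc_x)
{
    double scale = 0.0, ssq = 0.0;
    for (long j = 0, i = 0; j < n; j++, i += inc_x) {
        if (x[i] != 0.0) {
            double absxi = fabs(x[i]);
            if (scale < absxi) {
                /* new maximum: rescale the accumulated sum to the new scale */
                ssq = 1.0 + ssq * (scale / absxi) * (scale / absxi);
                scale = absxi;
            } else {
                /* element fits under the current scale */
                ssq += (absxi / scale) * (absxi / scale);
            }
        }
    }
    /* scale * sqrt(ssq) never forms sum(x[i]^2) directly, so it stays
     * representable even when sum(x[i]^2) itself would overflow */
    return scale * sqrt(ssq);
}
```

The vectorized paths above apply the same update one chunk at a time: `VMFGTVF_FLOAT` and `VMFIRSTM` detect whether any lane exceeds the current `scale`, and if so `VFREDMAXVS_FLOAT_TU` picks the new scale before the chunk is divided by it and accumulated into `vr` with `VFMACCVV_FLOAT_TU`.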