@@ -43,45 +43,79 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define t2 $r13
#define t3 $r14
#define t4 $r15
-
- /* Don't change following FR unless you know the effects. */
#define VX0 $xr15
#define VX1 $xr16
#define VX2 $xr17
#define VX3 $xr18
#define VX4 $xr21
+ #define VX5 $xr22
+ /* Don't change following FR unless you know the effects. */
#define res1 $xr19
#define res2 $xr20
+ #define RCP $f2
+ #define VALPHA $xr3
+
+ // The optimization for snrm2 cannot simply involve
+ // extending the data type from float to double and
+ // then summing the squares of the data. LAPACK tests
+ // have shown that this approach can still lead to data overflow.
+ // Instead, we need to find the maximum absolute value in the entire
+ // array and divide each data element by this maximum value before
+ // performing the calculation. This approach can avoid overflow (and does not require extending the data type).

PROLOGUE

#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif
+ bge $r0, N, .L999
+ beq $r0, INCX, .L999

+ addi.d $sp, $sp, -32
+ st.d $ra, $sp, 0
+ st.d N, $sp, 8
+ st.d X, $sp, 16
+ st.d INCX, $sp, 24
+ #ifdef DYNAMIC_ARCH
+ bl samax_k_LA264
+ #else
+ bl samax_k
+ #endif
+ ld.d $ra, $sp, 0
+ ld.d N, $sp, 8
+ ld.d X, $sp, 16
+ ld.d INCX, $sp, 24
+ addi.d $sp, $sp, 32
+
+ frecip.s RCP, $f0
+ vreplvei.w $vr3, $vr2, 0
+ xvpermi.d VALPHA, $xr3, 0x00
xvxor.v res1, res1, res1
xvxor.v res2, res2, res2
- bge $r0, N, .L999
- beq $r0, INCX, .L999
+ fcmp.ceq.s $fcc0, $f0, $f19
+ bcnez $fcc0, .L999
li.d TEMP, SIZE
slli.d INCX, INCX, BASE_SHIFT
- srai.d I, N, 3
+ srai.d I, N, 4
bne INCX, TEMP, .L20
- bge $r0, I, .L997
+ bge $r0, I, .L997
.align 3

.L10:
- xvld VX0, X, 0
- xvfcvtl.d.s VX1, VX0
- xvfcvth.d.s VX2, VX0
- xvfmadd.d res1, VX1, VX1, res1
- xvfmadd.d res2, VX2, VX2, res2
+ xvld VX0, X, 0
+ xvld VX5, X, 8 * SIZE
addi.d I, I, -1
- addi.d X, X, 8 * SIZE
+ addi.d X, X, 16 * SIZE
+
+ xvfmul.s VX0, VX0, VALPHA
+ xvfmul.s VX5, VX5, VALPHA
+
+ xvfmadd.s res1, VX0, VX0, res1
+ xvfmadd.s res2, VX5, VX5, res2
blt $r0, I, .L10
- .align 3
b .L996
+ .align 3

.L20:
bge $r0, I, .L997
@@ -107,47 +141,75 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld.w t3, X, 0
add.d X, X, INCX
ld.w t4, X, 0
+ add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 4
xvinsgr2vr.w VX0, t2, 5
xvinsgr2vr.w VX0, t3, 6
xvinsgr2vr.w VX0, t4, 7
+ xvfmul.s VX0, VX0, VALPHA
+ xvfmadd.s res1, VX0, VX0, res1
+
+ ld.w t1, X, 0
+ add.d X, X, INCX
+ ld.w t2, X, 0
add.d X, X, INCX
- xvfcvtl.d.s VX1, VX0
- xvfcvth.d.s VX2, VX0
- xvfmadd.d res1, VX1, VX1, res1
- xvfmadd.d res2, VX2, VX2, res2
+ ld.w t3, X, 0
+ add.d X, X, INCX
+ ld.w t4, X, 0
+ add.d X, X, INCX
+ xvinsgr2vr.w VX0, t1, 0
+ xvinsgr2vr.w VX0, t2, 1
+ xvinsgr2vr.w VX0, t3, 2
+ xvinsgr2vr.w VX0, t4, 3
+ ld.w t1, X, 0
+ add.d X, X, INCX
+ ld.w t2, X, 0
+ add.d X, X, INCX
+ ld.w t3, X, 0
+ add.d X, X, INCX
+ ld.w t4, X, 0
+ add.d X, X, INCX
+ xvinsgr2vr.w VX0, t1, 4
+ xvinsgr2vr.w VX0, t2, 5
+ xvinsgr2vr.w VX0, t3, 6
+ xvinsgr2vr.w VX0, t4, 7
+ xvfmul.s VX0, VX0, VALPHA
+ xvfmadd.s res2, VX0, VX0, res2
addi.d I, I, -1
blt $r0, I, .L21
- b .L996
+ .align 3

.L996:
- xvfadd.d res1, res1, res2
- xvpickve.d VX1, res1, 1
- xvpickve.d VX2, res1, 2
- xvpickve.d VX3, res1, 3
- fadd.d $f19, $f19, $f16
- fadd.d $f19, $f19, $f17
- fadd.d $f19, $f19, $f18
+ xvfadd.s res1, res1, res2
+ xvpermi.d VX1, res1, 0x4e
+ xvfadd.s res1, res1, VX1
+ vreplvei.w $vr16, $vr19, 1
+ vreplvei.w $vr17, $vr19, 2
+ vreplvei.w $vr18, $vr19, 3
+ xvfadd.s res1, VX1, res1
+ xvfadd.s res1, VX2, res1
+ xvfadd.s res1, VX3, res1
.align 3

.L997:
- andi I, N, 7
+ andi I, N, 15
bge $r0, I, .L999
.align 3

.L998:
fld.s $f15, X, 0
- add.d X, X, INCX
- addi.d I, I, -1
- fcvt.d.s $f15, $f15
- fmadd.d $f19, $f15, $f15, $f19
+ addi.d I, I, -1
+ fmul.s $f15, $f15, RCP
+ fmadd.s $f19, $f15, $f15, $f19
+ add.d X, X, INCX
blt $r0, I, .L998
.align 3

.L999:
- fsqrt.d $f19, $f19
+ fsqrt.s $f19, $f19
+ fmul.s $f0, $f19, $f0
move $r4, $r17
- fcvt.s.d $f0, $f19
jirl $r0, $r1, 0x0
+ .align 3

EPILOGUE
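
The comment block added by this patch describes the overflow-safe approach: find the largest absolute value in the vector (the samax_k call above), multiply every element by its reciprocal so that each squared term is at most 1, accumulate the sum of squares in single precision, and rescale at the end (fsqrt.s followed by fmul.s with the saved maximum in $f0). Below is a minimal scalar C sketch of that idea; the function name scaled_snrm2, the two explicit passes, and the assumption of a positive stride are illustrative only and do not reproduce the actual LASX kernel, which vectorizes the second pass with xvfmul.s/xvfmadd.s and keeps partial sums in res1/res2.

#include <math.h>

/* Hypothetical scalar illustration of the scaling scheme; not the OpenBLAS kernel. */
static float scaled_snrm2(long n, const float *x, long incx)
{
    if (n <= 0 || incx <= 0) return 0.0f;   /* sketch assumes a positive stride */

    /* Pass 1: largest absolute value, the job of the samax_k call in the patch. */
    float amax = 0.0f;
    for (long i = 0; i < n; i++) {
        float a = fabsf(x[i * incx]);
        if (a > amax) amax = a;
    }
    if (amax == 0.0f) return 0.0f;          /* mirrors the fcmp.ceq.s early exit */

    /* Pass 2: sum of squares of x[i] / amax; every term is <= 1.0f,
     * so the single-precision accumulator cannot overflow. */
    float rcp = 1.0f / amax;                /* frecip.s RCP, $f0 */
    float sum = 0.0f;
    for (long i = 0; i < n; i++) {
        float s = x[i * incx] * rcp;
        sum += s * s;
    }

    /* Undo the scaling: ||x||_2 = amax * sqrt(sum), i.e. fsqrt.s then fmul.s. */
    return amax * sqrtf(sum);
}

Bounding every squared term by 1 is what makes single-precision accumulation safe: the partial sums grow at most linearly in N rather than with the squares of the raw data, so the float-to-double widening used before this patch (xvfcvtl.d.s/xvfcvth.d.s) is no longer needed.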