Commit b471fa3

Author: pengxu (committed)
Loongarch64: fixed snrm2_lasx
1 parent 57bb46b commit b471fa3

File tree: 1 file changed (+94, −32 lines)


kernel/loongarch64/snrm2_lasx.S

Lines changed: 94 additions & 32 deletions
@@ -43,45 +43,79 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define t2 $r13
 #define t3 $r14
 #define t4 $r15
-
-/* Don't change following FR unless you know the effects. */
 #define VX0 $xr15
 #define VX1 $xr16
 #define VX2 $xr17
 #define VX3 $xr18
 #define VX4 $xr21
+#define VX5 $xr22
+/* Don't change following FR unless you know the effects. */
 #define res1 $xr19
 #define res2 $xr20
+#define RCP $f2
+#define VALPHA $xr3
+
+// The optimization for snrm2 cannot simply involve
+// extending the data type from float to double and
+// then summing the squares of the data. LAPACK tests
+// have shown that this approach can still lead to data overflow.
+// Instead, we need to find the maximum absolute value in the entire
+// array and divide each data element by this maximum value before
+// performing the calculation. This approach can avoid overflow (and does not require extending the data type).
 
     PROLOGUE
 
 #ifdef F_INTERFACE
     LDINT N, 0(N)
     LDINT INCX, 0(INCX)
 #endif
+    bge $r0, N, .L999
+    beq $r0, INCX, .L999
 
+    addi.d $sp, $sp, -32
+    st.d $ra, $sp, 0
+    st.d N, $sp, 8
+    st.d X, $sp, 16
+    st.d INCX, $sp, 24
+#ifdef DYNAMIC_ARCH
+    bl samax_k_LA264
+#else
+    bl samax_k
+#endif
+    ld.d $ra, $sp, 0
+    ld.d N, $sp, 8
+    ld.d X, $sp, 16
+    ld.d INCX, $sp, 24
+    addi.d $sp, $sp, 32
+
+    frecip.s RCP, $f0
+    vreplvei.w $vr3, $vr2, 0
+    xvpermi.d VALPHA, $xr3, 0x00
     xvxor.v res1, res1, res1
     xvxor.v res2, res2, res2
-    bge $r0, N, .L999
-    beq $r0, INCX, .L999
+    fcmp.ceq.s $fcc0, $f0, $f19
+    bcnez $fcc0, .L999
     li.d TEMP, SIZE
     slli.d INCX, INCX, BASE_SHIFT
-    srai.d I, N, 3
+    srai.d I, N, 4
     bne INCX, TEMP, .L20
-    bge $r0, I, .L997
+    bge $r0, I, .L997
     .align 3
 
 .L10:
-    xvld VX0, X, 0
-    xvfcvtl.d.s VX1, VX0
-    xvfcvth.d.s VX2, VX0
-    xvfmadd.d res1, VX1, VX1, res1
-    xvfmadd.d res2, VX2, VX2, res2
+    xvld VX0, X, 0
+    xvld VX5, X, 8 * SIZE
    addi.d I, I, -1
-    addi.d X, X, 8 * SIZE
+    addi.d X, X, 16 * SIZE
+
+    xvfmul.s VX0, VX0, VALPHA
+    xvfmul.s VX5, VX5, VALPHA
+
+    xvfmadd.s res1, VX0, VX0, res1
+    xvfmadd.s res2, VX5, VX5, res2
     blt $r0, I, .L10
-    .align 3
     b .L996
+    .align 3
 
 .L20:
     bge $r0, I, .L997
@@ -107,47 +141,75 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     ld.w t3, X, 0
     add.d X, X, INCX
     ld.w t4, X, 0
+    add.d X, X, INCX
     xvinsgr2vr.w VX0, t1, 4
     xvinsgr2vr.w VX0, t2, 5
     xvinsgr2vr.w VX0, t3, 6
     xvinsgr2vr.w VX0, t4, 7
+    xvfmul.s VX0, VX0, VALPHA
+    xvfmadd.s res1, VX0, VX0, res1
+
+    ld.w t1, X, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
     add.d X, X, INCX
-    xvfcvtl.d.s VX1, VX0
-    xvfcvth.d.s VX2, VX0
-    xvfmadd.d res1, VX1, VX1, res1
-    xvfmadd.d res2, VX2, VX2, res2
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    xvfmul.s VX0, VX0, VALPHA
+    xvfmadd.s res2, VX0, VX0, res2
     addi.d I, I, -1
     blt $r0, I, .L21
-    b .L996
+    .align 3
 
 .L996:
-    xvfadd.d res1, res1, res2
-    xvpickve.d VX1, res1, 1
-    xvpickve.d VX2, res1, 2
-    xvpickve.d VX3, res1, 3
-    fadd.d $f19, $f19, $f16
-    fadd.d $f19, $f19, $f17
-    fadd.d $f19, $f19, $f18
+    xvfadd.s res1, res1, res2
+    xvpermi.d VX1, res1, 0x4e
+    xvfadd.s res1, res1, VX1
+    vreplvei.w $vr16, $vr19, 1
+    vreplvei.w $vr17, $vr19, 2
+    vreplvei.w $vr18, $vr19, 3
+    xvfadd.s res1, VX1, res1
+    xvfadd.s res1, VX2, res1
+    xvfadd.s res1, VX3, res1
     .align 3
 
 .L997:
-    andi I, N, 7
+    andi I, N, 15
     bge $r0, I, .L999
     .align 3
 
 .L998:
     fld.s $f15, X, 0
-    add.d X, X, INCX
-    addi.d I, I, -1
-    fcvt.d.s $f15, $f15
-    fmadd.d $f19, $f15, $f15, $f19
+    addi.d I, I, -1
+    fmul.s $f15, $f15, RCP
+    fmadd.s $f19, $f15, $f15, $f19
+    add.d X, X, INCX
     blt $r0, I, .L998
     .align 3
 
 .L999:
-    fsqrt.d $f19, $f19
+    fsqrt.s $f19, $f19
+    fmul.s $f0, $f19, $f0
     move $r4, $r17
-    fcvt.s.d $f0, $f19
     jirl $r0, $r1, 0x0
+    .align 3
 
 EPILOGUE
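
In short, the new kernel computes snrm2 as amax * sqrt(sum((x[i] / amax)^2)) instead of widening each element to double. Below is a minimal scalar C sketch of that flow, not the committed code: samax here is a hypothetical stand-in for the samax_k / samax_k_LA264 kernel the assembly calls, and rcp mirrors the frecip.s reciprocal that gets broadcast into all eight LASX lanes as VALPHA.

#include <math.h>

/* Hypothetical stand-in for OpenBLAS's samax kernel (samax_k /
 * samax_k_LA264 in the diff): largest absolute value in the vector. */
static float samax(long n, const float *x, long incx)
{
    float m = 0.0f;
    for (long i = 0; i < n; i++, x += incx) {
        float a = fabsf(*x);
        if (a > m) m = a;
    }
    return m;
}

/* Scalar model of the kernel's overall flow. */
static float snrm2_scaled(long n, const float *x, long incx)
{
    if (n <= 0 || incx == 0)      /* the bge/beq early exits to .L999 */
        return 0.0f;

    float amax = samax(n, x, incx);
    if (amax == 0.0f)             /* fcmp.ceq.s + bcnez: all-zero input */
        return 0.0f;

    float rcp = 1.0f / amax;      /* frecip.s RCP, $f0 */
    float sum = 0.0f;
    for (long i = 0; i < n; i++, x += incx) {
        float s = *x * rcp;       /* |s| <= 1, so s * s cannot overflow */
        sum += s * s;             /* the xvfmadd.s accumulation */
    }
    return amax * sqrtf(sum);     /* fsqrt.s, then fmul.s by amax in .L999 */
}

With VALPHA broadcast once up front, the two xvld + xvfmadd.s streams in .L10 accumulate sixteen scaled squares per iteration, which is why the unroll count moves from srai.d I, N, 3 to srai.d I, N, 4 and the tail mask from 7 to 15.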

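The reworked .L996 reduction can be read the same way. A scalar model, assuming res1 and res2 each hold eight per-lane float partial sums: xvpermi.d VX1, res1, 0x4e swaps the two 128-bit halves so a single xvfadd.s folds lanes 4..7 into lanes 0..3, and the vreplvei.w + xvfadd.s tail gathers the remaining lanes into lane 0 (which .L999 then square-roots).

/* Scalar model of the .L996 lane reduction. */
static float reduce8(const float res1[8], const float res2[8])
{
    float v[8], lo[4];

    /* xvfadd.s res1, res1, res2: merge the accumulators lane-wise */
    for (int i = 0; i < 8; i++)
        v[i] = res1[i] + res2[i];

    /* xvpermi.d ..., 0x4e swaps the 128-bit halves; the following
     * xvfadd.s folds lanes 4..7 into lanes 0..3 */
    for (int i = 0; i < 4; i++)
        lo[i] = v[i] + v[i + 4];

    /* vreplvei.w + xvfadd.s tail: sum the last four lanes into lane 0 */
    return lo[0] + lo[1] + lo[2] + lo[3];
}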