Skip to content

Commit 31d326f

Browse files
CNClareChen, XiWeiGu
authored and committed
LoongArch64: Fixed dot_lsx.S
Fixed incorrect register usage in instructions.

Signed-off-by: gxw <guxiwei-hf@loongson.cn>
1 parent 5d6356b commit 31d326f

File tree

1 file changed: 29 additions (+29) and 55 deletions (-55).

kernel/loongarch64/dot_lsx.S

Lines changed: 29 additions & 55 deletions
Original file line number | Diff line number | Diff line change
@@ -53,8 +53,8 @@ PROLOGUE
5353
#endif
5454

5555
/* init $f8 and $f9 to zero */
56-
SUB s1, s1, s1
57-
SUB s2, s2, s2
56+
vxor.v $vr8, $vr8, $vr8
57+
vxor.v $vr9, $vr9, $vr9
5858
slli.d INCX, INCX, BASE_SHIFT
5959
li.d TEMP, SIZE
6060
slli.d INCY, INCY, BASE_SHIFT
@@ -64,20 +64,6 @@ PROLOGUE
6464

6565
/* !((inc_x == 1) && (inc_y == 1)) */
6666

67-
/* init $vr8 and $vr9 to zero */
68-
#ifdef DOUBLE
69-
vldrepl.d $vr0, X, 0
70-
#else
71-
vldrepl.w $vr0, X, 0
72-
#endif
73-
#ifdef DSDOT
74-
vfcvtl.d.s $vr0, $vr0
75-
vfsub.d $vr8, $vr0, $vr0
76-
vfsub.d $vr9, $vr0, $vr0
77-
#else
78-
VFSUB $vr8, $vr0, $vr0
79-
VFSUB $vr9, $vr0, $vr0
80-
#endif
8167

8268
#ifdef DOUBLE
8369
srai.d I, N, 3
@@ -99,31 +85,31 @@ PROLOGUE
9985
addi.w I, I, -1
10086
addi.d X, X, 64
10187
addi.d Y, Y, 64
102-
#ifdef DSDOT
88+
#ifndef DOUBLE
10389
vfcvtl.d.s $vr10, $vr0
10490
vfcvtl.d.s $vr11, $vr4
10591
vfcvth.d.s $vr12, $vr0
10692
vfcvth.d.s $vr13, $vr4
107-
vfmadd.d $vr8, $vr10, $vr12, $vr8
108-
vfmadd.d $vr9, $vr11, $vr13, $vr9
93+
vfmadd.d $vr8, $vr10, $vr11, $vr8
94+
vfmadd.d $vr9, $vr12, $vr13, $vr9
10995
vfcvtl.d.s $vr10, $vr1
11096
vfcvtl.d.s $vr11, $vr5
11197
vfcvth.d.s $vr12, $vr1
11298
vfcvth.d.s $vr13, $vr5
113-
vfmadd.d $vr8, $vr10, $vr12, $vr8
114-
vfmadd.d $vr9, $vr11, $vr13, $vr9
99+
vfmadd.d $vr8, $vr10, $vr11, $vr8
100+
vfmadd.d $vr9, $vr12, $vr13, $vr9
115101
vfcvtl.d.s $vr10, $vr2
116102
vfcvtl.d.s $vr11, $vr6
117103
vfcvth.d.s $vr12, $vr2
118104
vfcvth.d.s $vr13, $vr6
119-
vfmadd.d $vr8, $vr10, $vr12, $vr8
120-
vfmadd.d $vr9, $vr11, $vr13, $vr9
105+
vfmadd.d $vr8, $vr10, $vr11, $vr8
106+
vfmadd.d $vr9, $vr12, $vr13, $vr9
121107
vfcvtl.d.s $vr10, $vr3
122108
vfcvtl.d.s $vr11, $vr7
123109
vfcvth.d.s $vr12, $vr3
124110
vfcvth.d.s $vr13, $vr7
125-
vfmadd.d $vr8, $vr10, $vr12, $vr8
126-
vfmadd.d $vr9, $vr11, $vr13, $vr9
111+
vfmadd.d $vr8, $vr10, $vr11, $vr8
112+
vfmadd.d $vr9, $vr12, $vr13, $vr9
127113
#else
128114
VFMADD $vr8, $vr0, $vr4, $vr8
129115
VFMADD $vr9, $vr1, $vr5, $vr9
@@ -149,37 +135,24 @@ PROLOGUE
149135
addi.w I, I, -1
150136
addi.d X, X, 16
151137
addi.d Y, Y, 16
152-
#ifdef DSDOT
138+
#ifndef DOUBLE
153139
vfcvtl.d.s $vr10, $vr0
154140
vfcvtl.d.s $vr11, $vr4
155141
vfcvth.d.s $vr12, $vr0
156142
vfcvth.d.s $vr13, $vr4
157-
vfmadd.d $vr8, $vr10, $vr12, $vr8
158-
vfmadd.d $vr9, $vr11, $vr13, $vr9
143+
vfmadd.d $vr8, $vr10, $vr11, $vr8
144+
vfmadd.d $vr9, $vr12, $vr13, $vr9
159145
#else
160146
VFMADD $vr8, $vr0, $vr4, $vr8
161147
#endif
162148
bnez I, .L13
163149
.align 3
164150
.L14:
165151
/* store dot in s1 $f8 */
166-
#ifdef DSDOT
167152
vfadd.d $vr8, $vr8, $vr9
168-
fsub.s s2, s2, s2 /* set s2 to 0.0 */
153+
fsub.d s2, s2, s2 /* set s2 to 0.0 */
169154
vpackod.d $vr0, $vr8, $vr8
170155
vfadd.d $vr8, $vr8, $vr0
171-
#else
172-
VFADD $vr8, $vr8, $vr9
173-
SUB s2, s2, s2 /* set s2 to 0.0 */
174-
vpackod.d $vr0, $vr8, $vr8
175-
#ifdef DOUBLE
176-
VFADD $vr8, $vr8, $vr0
177-
#else
178-
VFADD $vr8, $vr8, $vr0
179-
vpackod.w $vr0, $vr8, $vr8
180-
VFADD $vr8, $vr8, $vr0
181-
#endif /* defined DOUBLE */
182-
#endif /* defined DSDOT */
183156
.align 3
184157
.L15:
185158
#ifdef DOUBLE
@@ -193,7 +166,7 @@ PROLOGUE
193166
/* DOUBLE: 1 ; FLOAT: 1~3 */
194167
LD a1, X, 0
195168
LD b1, Y, 0
196-
#ifdef DSDOT
169+
#ifndef DOUBLE
197170
fcvt.d.s a1, a1
198171
fcvt.d.s b1, b1
199172
fmadd.d s1, b1, a1, s1
@@ -236,7 +209,7 @@ PROLOGUE
236209
add.d X, X, INCX
237210
LD b1, Y, 0 * SIZE
238211
add.d Y, Y, INCY
239-
#ifdef DSDOT
212+
#ifndef DOUBLE
240213
fcvt.d.s a1, a1
241214
fcvt.d.s b1, b1
242215
fmadd.d s1, b1, a1, s1
@@ -248,7 +221,7 @@ PROLOGUE
248221
add.d X, X, INCX
249222
LD b1, Y, 0 * SIZE
250223
add.d Y, Y, INCY
251-
#ifdef DSDOT
224+
#ifndef DOUBLE
252225
fcvt.d.s a1, a1
253226
fcvt.d.s b1, b1
254227
fmadd.d s2, b1, a1, s2
@@ -260,7 +233,7 @@ PROLOGUE
260233
add.d X, X, INCX
261234
LD b1, Y, 0 * SIZE
262235
add.d Y, Y, INCY
263-
#ifdef DSDOT
236+
#ifndef DOUBLE
264237
fcvt.d.s a1, a1
265238
fcvt.d.s b1, b1
266239
fmadd.d s1, b1, a1, s1
@@ -272,7 +245,7 @@ PROLOGUE
272245
add.d X, X, INCX
273246
LD b1, Y, 0 * SIZE
274247
add.d Y, Y, INCY
275-
#ifdef DSDOT
248+
#ifndef DOUBLE
276249
fcvt.d.s a1, a1
277250
fcvt.d.s b1, b1
278251
fmadd.d s2, b1, a1, s2
@@ -284,7 +257,7 @@ PROLOGUE
284257
add.d X, X, INCX
285258
LD b1, Y, 0 * SIZE
286259
add.d Y, Y, INCY
287-
#ifdef DSDOT
260+
#ifndef DOUBLE
288261
fcvt.d.s a1, a1
289262
fcvt.d.s b1, b1
290263
fmadd.d s1, b1, a1, s1
@@ -296,7 +269,7 @@ PROLOGUE
296269
add.d X, X, INCX
297270
LD b1, Y, 0 * SIZE
298271
add.d Y, Y, INCY
299-
#ifdef DSDOT
272+
#ifndef DOUBLE
300273
fcvt.d.s a1, a1
301274
fcvt.d.s b1, b1
302275
fmadd.d s2, b1, a1, s2
@@ -308,7 +281,7 @@ PROLOGUE
308281
add.d X, X, INCX
309282
LD b1, Y, 0 * SIZE
310283
add.d Y, Y, INCY
311-
#ifdef DSDOT
284+
#ifndef DOUBLE
312285
fcvt.d.s a1, a1
313286
fcvt.d.s b1, b1
314287
fmadd.d s1, b1, a1, s1
@@ -321,7 +294,7 @@ PROLOGUE
321294
LD b1, Y, 0 * SIZE
322295
add.d Y, Y, INCY
323296
addi.d I, I, -1
324-
#ifdef DSDOT
297+
#ifndef DOUBLE
325298
fcvt.d.s a1, a1
326299
fcvt.d.s b1, b1
327300
fmadd.d s2, b1, a1, s2
@@ -342,7 +315,7 @@ PROLOGUE
342315
LD b1, Y, 0 * SIZE
343316
add.d Y, Y, INCY
344317
addi.d I, I, -1
345-
#ifdef DSDOT
318+
#ifndef DOUBLE
346319
fcvt.d.s a1, a1
347320
fcvt.d.s b1, b1
348321
fmadd.d s1, b1, a1, s1
@@ -353,12 +326,13 @@ PROLOGUE
353326
.align 3
354327

355328
.L999:
356-
#ifdef DSDOT
357329
fadd.d $f0, s1, s2
330+
move $r4, $r17
331+
#if defined(DOUBLE)
332+
#elif defined(DSDOT)
358333
#else
359-
ADD $f0, s1, s2
334+
fcvt.s.d $f0, $f0
360335
#endif
361-
move $r4, $r17
362336
jirl $r0, $r1, 0x0
363337

364338
EPILOGUE

0 commit comments

Comments
 (0)