
Commit 7f1ebc7

CNClareChenXiWeiGu authored and committed
LoongArch64: Fixed iamax_lsx.S
Fixed index retrieval issue when there are identical maximum absolute values.

Signed-off-by: Hao Chen <chenhao@loongson.cn>
Signed-off-by: gxw <guxiwei-hf@loongson.cn>
1 parent 31d326f commit 7f1ebc7
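
The contract at stake: BLAS i?amax returns the index of the first element whose absolute value equals the maximum, so ties must resolve to the lowest index. A minimal scalar sketch of that contract (illustrative reference only, not code from this commit):

    #include <math.h>
    #include <stddef.h>

    /* Reference semantics for i?amax: 1-based index of the FIRST element
     * with the largest |x|. The strict '>' keeps the earliest index when
     * several elements share the same absolute value. */
    static size_t iamax_ref(size_t n, const double *x)
    {
        size_t best = 0;
        double maxf = fabs(x[0]);
        for (size_t i = 1; i < n; i++) {
            double a = fabs(x[i]);
            if (a > maxf) {        /* strict: a tie keeps 'best' */
                maxf = a;
                best = i;
            }
        }
        return best + 1;
    }

The old kernel selected indices with vfmaxa followed by a signed vfcmp.ceq, which cannot see that, say, -5.0 and 5.0 tie in magnitude, so a later duplicate could win the select; the rewrite below compares absolute values with a strict vfcmp.clt instead.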

File tree: 1 file changed, +134 −100 lines

kernel/loongarch64/iamax_lsx.S

Lines changed: 134 additions & 100 deletions
@@ -56,19 +56,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VI3 $vr8
 #define VI4 $vr19
 #define VT0 $vr23
+#define VZE $vr3
+#define VT1 $vr4
+#define VT2 $vr5
+#define VC0 $vr6

 PROLOGUE
 li.d i0, 0
 bge $r0, N, .L999
 bge $r0, INCX, .L999
 li.d TEMP, 1
+vldi VZE, 0
 slli.d TEMP, TEMP, BASE_SHIFT
 slli.d INCX, INCX, BASE_SHIFT
 bne INCX, TEMP, .L20
 vld VM0, X, 0
 #ifdef DOUBLE
+vfsub.d VT1, VZE, VM0
 addi.d i0, i0, 1
 srai.d I, N, 3
+vfmaxa.d VM0, VM0, VT1
 bge $r0, I, .L11
 slli.d i0, i0, 1 //2
 vreplgr2vr.d VINC2, i0
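
The new VZE register holds zero so that a vector absolute value can be formed without a dedicated abs instruction: VT1 = 0 - x gives -x, and vfmaxa(x, -x) keeps the operand with the larger magnitude, i.e. |x|. The same idea in scalar C (a sketch of the technique, not the kernel itself):

    #include <math.h>

    /* |x| as max(x, -x): both operands have equal magnitude, so the
     * maximum is the non-negative one. Mirrors the vfsub/vfmaxa pair
     * added above. */
    static inline double abs_via_max(double x)
    {
        return fmax(x, -x);
    }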
@@ -79,12 +86,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 addi.d i0, i0, 1
 vinsgr2vr.d VI1, i0, 1
 addi.d i0, i0, 3
-vinsgr2vr.d VI0, i0, 0 //1
+vinsgr2vr.d VI0, i0, 0 //initialize the index value for vectorization
 addi.d i0, i0, 1
-vinsgr2vr.d VI0, i0, 1 //2
+vinsgr2vr.d VI0, i0, 1
 #else
+vfsub.s VT1, VZE, VM0
 addi.w i0, i0, 1
 srai.d I, N, 3
+vfmaxa.s VM0, VM0, VT1
 bge $r0, I, .L21
 slli.w i0, i0, 2 //4
 vreplgr2vr.w VINC2, i0
@@ -115,39 +124,51 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 vadd.d VI1, VI1, VINC4
 vld VX1, X, 2 * SIZE
 vadd.d VI2, VI1, VINC2
-vfmaxa.d x1, VX0, VX1
-vfcmp.ceq.d VT0, VX0, x1
-vbitsel.v x2, VI2, VI1, VT0
+vfsub.d VT1, VZE, VX0
+vfsub.d VT2, VZE, VX1
+vfmaxa.d VX0, VX0, VT1
+vfmaxa.d VX1, VX1, VT2
+vfcmp.clt.d VT0, VX0, VX1 //abs(x0) < abs(x1)
+vbitsel.v x1, VX0, VX1, VT0 //abs(maxf)
+vbitsel.v x2, VI1, VI2, VT0 //i
+
 vld VX0, X, 4 * SIZE
 vadd.d VI1, VI2, VINC2
 vld VX1, X, 6 * SIZE
 vadd.d VI2, VI1, VINC2
-vfmaxa.d x3, VX0, VX1
-vfcmp.ceq.d VT0, VX0, x3
-vbitsel.v x4, VI2, VI1, VT0
-vfmaxa.d x3, x1, x3
-vfcmp.ceq.d VT0, x1, x3
-vbitsel.v x2, x4, x2, VT0
-vfmaxa.d VM1, VM0, x3
-vfcmp.ceq.d VT0, VM0, VM1
-vbitsel.v VM0, VM1, VM0, VT0
-vbitsel.v VI0, x2, VI0, VT0
+vfsub.d VT1, VZE, VX0
+vfsub.d VT2, VZE, VX1
+vfmaxa.d VX0, VX0, VT1
+vfmaxa.d VX1, VX1, VT2
+vfcmp.clt.d VT0, VX0, VX1
+vbitsel.v x3, VX0, VX1, VT0 //abs(maxf)
+vbitsel.v x4, VI1, VI2, VT0 //i
+vfcmp.clt.d VC0, x1, x3
+vbitsel.v x1, x1, x3, VC0 //abs(maxf)
+vbitsel.v x2, x2, x4, VC0 //i
+vfcmp.clt.d VT0, VM0, x1
 addi.d I, I, -1
 addi.d X, X, 8 * SIZE
+vbitsel.v VM0, VM0, x1, VT0
+vbitsel.v VI0, VI0, x2, VT0
 #else
 vld VX0, X, 0 * SIZE
 vadd.w VI1, VI1, VINC4
 vld VX1, X, 4 * SIZE
 vadd.w VI2, VI1, VINC2
-vfmaxa.s VM1, VX0, VX1
-vfcmp.ceq.s VT0, VX0, VM1
+vfsub.s VT1, VZE, VX0
+vfsub.s VT2, VZE, VX1
+vfmaxa.s VX0, VX0, VT1
+vfmaxa.s VX1, VX1, VT2
+vfcmp.clt.s VT0, VX0, VX1
+vbitsel.v x1, VX0, VX1, VT0 //abs(maxf)
+vbitsel.v x2, VI1, VI2, VT0 //i
 addi.d I, I, -1
-vbitsel.v VI2, VI2, VI1, VT0
-vfmaxa.s VM1, VM0, VM1
-vfcmp.ceq.s VT0, VM0, VM1
+vfcmp.clt.s VT0, VM0, x1
 addi.d X, X, 8 * SIZE
-vbitsel.v VM0, VM1, VM0, VT0
-vbitsel.v VI0, VI2, VI0, VT0
+vbitsel.v VM0, VM0, x1, VT0
+vbitsel.v VI0, VI0, x2, VT0
+
 #endif
 blt $r0, I, .L10
 .align 3
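
Each compare/select pair in the rewritten loop uses a strict less-than, so a later candidate only replaces an earlier one when it is strictly larger; equal magnitudes keep the lower index. One such step per lane, sketched in C with hypothetical names:

    /* Mirrors a vfcmp.clt VT0, cur, cand / vbitsel.v pair: the candidate
     * wins only on a strict compare, so ties preserve the earlier index. */
    typedef struct { double absval; long idx; } lane_t;

    static inline lane_t select_max(lane_t earlier, lane_t later)
    {
        return (earlier.absval < later.absval) ? later : earlier;
    }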
@@ -158,7 +179,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 vreplvei.d VI2, VI0, 1
 vreplvei.d x1, VM0, 0
 vreplvei.d x2, VM0, 1
-fcmp.ceq.d $fcc0, $f10, $f9
+fcmp.ceq.d $fcc0, $f9, $f10
 bceqz $fcc0, .L16
 vfcmp.clt.d VT0, VI1, VI2
 vbitsel.v VI0, VI2, VI1, VT0
@@ -172,28 +193,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 vreplvei.w x2, VM0, 1
 vreplvei.w x3, VM0, 2
 vreplvei.w x4, VM0, 3
-vfmaxa.s VM1, x1, x2
-vfcmp.ceq.s VT0, VM1, x1
-vbitsel.v VINC2, VI2, VI1, VT0
-vfmaxa.s VM0, x3, x4
-vfcmp.ceq.s VT0, x3, VM0
-vbitsel.v VINC4, VI4, VI3, VT0
-vfmaxa.s VM0, VM0, VM1
-vfcmp.ceq.s VT0, VM0, VM1
-vbitsel.v VI0, VINC4, VINC2, VT0
-fcmp.ceq.d $fcc0, $f15, $f9
-bceqz $fcc0, .L26
-vfcmp.clt.s VT0, VI1, VI0
-vbitsel.v VI0, VI0, VI1, VT0
 b .L26
 #endif
 .align 3

 #ifdef DOUBLE
 .L16:
-vfmaxa.d VM0, x1, x2
-vfcmp.ceq.d VT0, x1, VM0
-vbitsel.v VI0, VI2, VI1, VT0
+vfcmp.clt.d VT0, x1, x2
+vbitsel.v VI0, VI1, VI2, VT0
+vbitsel.v VM0, x1, x2, VT0
 .align 3

 .L17:
@@ -212,10 +220,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 .L13:
 fld.d $f9, X, 0
-vfmaxa.d VM1, x1, VM0
-vfcmp.ceq.d VT0, VM0, VM1
-vbitsel.v VM0, VM1, VM0, VT0
-vbitsel.v VI0, VI1, VI0, VT0
+fsub.d $f10, $f3, $f9
+vfmaxa.d x1, x1, x2
+vfcmp.clt.d VT0, VM0, x1
+vbitsel.v VM0, VM0, x1, VT0
+vbitsel.v VI0, VI0, VI1, VT0
 addi.d I, I, -1
 addi.d i1, i1, 1
 addi.d X, X, SIZE
@@ -241,10 +250,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 add.d TEMP, TEMP, INCX
 vinsgr2vr.d VM0, t2, 1
 slli.d i0, i0, 1 //2
+vfsub.d VT1, VZE, VM0
 vreplgr2vr.d VINC2, i0
 slli.d i0, i0, 1 //4
 vreplgr2vr.d VINC4, i0
 addi.d i0, i0, -7
+vfmaxa.d VM0, VM0, VT1
 vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
 addi.d i0, i0, 1
 vinsgr2vr.d VI1, i0, 1
@@ -269,9 +280,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 add.d X, X, INCX
 vinsgr2vr.d VX1, t2, 1
 vadd.d VI2, VI1, VINC2
-vfmaxa.d x1, VX0, VX1
-vfcmp.ceq.d VT0, VX0, x1
-vbitsel.v x2, VI2, VI1, VT0
+
+vfsub.d VT1, VZE, VX0
+vfsub.d VT2, VZE, VX1
+vfmaxa.d VX0, VX0, VT1
+vfmaxa.d VX1, VX1, VT2
+vfcmp.clt.d VT0, VX0, VX1
+vbitsel.v x1, VX0, VX1, VT0
+vbitsel.v x2, VI1, VI2, VT0
 ld.d t1, X, 0 * SIZE
 add.d X, X, INCX
 vinsgr2vr.d VX0, t1, 0
@@ -286,16 +302,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 add.d X, X, INCX
 vinsgr2vr.d VX1, t2, 1
 vadd.d VI2, VI1, VINC2
-vfmaxa.d x3, VX0, VX1
-vfcmp.ceq.d VT0, VX0, x3
-vbitsel.v x4, VI2, VI1, VT0
-vfmaxa.d x3, x1, x3
-vfcmp.ceq.d VT0, x1, x3
-vbitsel.v x2, x4, x2, VT0
-vfmaxa.d VM1, VM0, x3
-vbitsel.v VM0, VM1, VM0, VT0
-vfcmp.ceq.d VT0, VM0, VM1
-vbitsel.v VI0, x2, VI0, VT0
+vfsub.d VT1, VZE, VX0
+vfsub.d VT2, VZE, VX1
+vfmaxa.d VX0, VX0, VT1
+vfmaxa.d VX1, VX1, VT2
+vfcmp.clt.d VT0, VX0, VX1
+vbitsel.v x3, VX0, VX1, VT0
+vbitsel.v x4, VI1, VI2, VT0
+vfcmp.clt.d VC0, x1, x3
+vbitsel.v x1, x1, x3, VC0
+vbitsel.v x2, x2, x4, VC0
+vfcmp.clt.d VT0, VM0, x1
+vbitsel.v VM0, VM0, x1, VT0
+vbitsel.v VI0, VI0, x2, VT0
+
 addi.d I, I, -1
 blt $r0, I, .L24
 .align 3
@@ -313,9 +333,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .align 3

 .L26:
-vfmaxa.d VM0, x1, x2
-vfcmp.ceq.d VT0, x1, VM0
-vbitsel.v VI0, VI2, VI1, VT0
+vfcmp.clt.d VT0, x1, x2
+vbitsel.v VI0, VI1, VI2, VT0
+vbitsel.v VM0, x1, x2, VT0
 .align 3

 .L27:
@@ -389,14 +409,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 vinsgr2vr.w VX1, t3, 2
 vinsgr2vr.w VX1, t4, 3
 vadd.w VI2, VI1, VINC2
-vfmaxa.s VM1, VX0, VX1
-vfcmp.ceq.s VT0, VX0, VM1
-vbitsel.v VI2, VI2, VI1, VT0
-vfmaxa.s VM1, VM0, VM1
-vfcmp.ceq.s VT0, VM0, VM1
+vfsub.s VT1, VZE, VX0
+vfsub.s VT2, VZE, VX1
+vfmaxa.s VX0, VX0, VT1
+vfmaxa.s VX1, VX1, VT2
+vfcmp.clt.s VT0, VX0, VX1
+vbitsel.v x1, VX0, VX1, VT0
+vbitsel.v x2, VI1, VI2, VT0 //i
+
 addi.d I, I, -1
-vbitsel.v VM0, VM1, VM0, VT0
-vbitsel.v VI0, VI2, VI0, VT0
+vfcmp.clt.s VT0, VM0, x1
+vbitsel.v VM0, VM0, x1, VT0
+vbitsel.v VI0, VI0, x2, VT0
 blt $r0, I, .L24
 .align 3

@@ -409,42 +433,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 vreplvei.w x2, VM0, 1
 vreplvei.w x3, VM0, 2
 vreplvei.w x4, VM0, 3
-vfmaxa.s VM1, x1, x2
-vfcmp.ceq.s VT0, VM1, x1
-vbitsel.v VINC2, VI2, VI1, VT0
-vfmaxa.s VM0, x3, x4
-vfcmp.ceq.s VT0, x3, VM0
-vbitsel.v VINC4, VI4, VI3, VT0
-vfmaxa.s VM0, VM0, VM1
-vfcmp.ceq.s VT0, VM0, VM1
-vbitsel.v VI0, VINC4, VINC2, VT0
-fcmp.ceq.d $fcc0, $f15, $f9
-bceqz $fcc0, .L26
-vfcmp.clt.s VT0, VI1, VI0
-vbitsel.v VI0, VI0, VI1, VT0
 .align 3

 .L26:
-fcmp.ceq.d $fcc0, $f15, $f10
-bceqz $fcc0, .L27
-vfcmp.clt.s VT0, VI2, VI0
-vbitsel.v VI0, VI0, VI2, VT0
+fcmp.ceq.s $fcc0, $f9, $f10
+bceqz $fcc0, .L31
+vfcmp.clt.s VT0, VI1, VI2
+vbitsel.v VI1, VI2, VI1, VT0
+b .L32
 .align 3
-
-.L27:
-fcmp.ceq.d $fcc0, $f15, $f11
-bceqz $fcc0, .L28
-vfcmp.clt.s VT0, VI3, VI0
-vbitsel.v VI0, VI0, VI3, VT0
+.L31:
+vfcmp.clt.s VT0, x1, x2
+vbitsel.v VI1, VI1, VI2, VT0
+vbitsel.v x1, x1, x2, VT0
 .align 3
-
-.L28:
-fcmp.ceq.d $fcc0, $f15, $f12
-bceqz $fcc0, .L29
-vfcmp.clt.s VT0, VI4, VI0
-vbitsel.v VI0, VI0, VI4, VT0
+.L32:
+fcmp.ceq.s $fcc0, $f11, $f12
+bceqz $fcc0, .L33
+vfcmp.clt.s VT1, VI3, VI4
+vbitsel.v VI3, VI4, VI3, VT1
+b .L34
+.align 3
+.L33:
+vfcmp.clt.s VT1, x3, x4
+vbitsel.v x3, x3, x4, VT1
+vbitsel.v VI3, VI3, VI4, VT1
+.align 3
+.L34:
+fcmp.ceq.s $fcc0, $f9, $f11
+bceqz $fcc0, .L35
+vfcmp.clt.s VT0, VI1, VI3
+vbitsel.v VI0, VI3, VI1, VT0
+vxor.v VM0, x1, VZE
+b .L29
+.align 3
+.L35:
+vfcmp.clt.s VT0, x1, x3
+vbitsel.v VM0, x1, x3, VT0
+vbitsel.v VI0, VI1, VI3, VT0
 .align 3
-
 .L29:
 movfr2gr.s i0, $f20
 .align 3
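
The rewritten tail reduction (.L26 through .L35) folds the four float lanes pairwise: if two lane values compare equal, the smaller index is kept outright; otherwise both the value and its index follow the strictly larger lane, and the two pairwise winners are combined the same way. Sketched in C under that reading of the branch flow:

    /* Fold candidate (vb, ib) into (*v, *i): equal values keep the
     * smaller index, otherwise the strictly larger value wins. */
    static void fold_lane(float *v, long *i, float vb, long ib)
    {
        if (*v == vb) {
            if (ib < *i) *i = ib;
        } else if (vb > *v) {
            *v = vb;
            *i = ib;
        }
    }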
@@ -462,10 +489,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 .L22:
 LD $f9, X, 0
-VFMAXA VM1, x1, VM0
-VCMPEQ VT0, VM0, VM1
-vbitsel.v VM0, VM1, VM0, VT0
-vbitsel.v VI0, VI1, VI0, VT0
+#ifdef DOUBLE
+fsub.d $f10, $f3, $f9
+vfmaxa.d x1, x1, x2
+vfcmp.clt.d VT0, VM0, x1
+#else
+fsub.s $f10, $f3, $f9
+vfmaxa.s x1, x1, x2
+vfcmp.clt.s VT0, VM0, x1
+#endif
+vbitsel.v VM0, VM0, x1, VT0
+vbitsel.v VI0, VI0, VI1, VT0
 addi.d I, I, -1
 addi.d i1, i1, 1
 add.d X, X, INCX
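
A quick way to exercise the fixed path from C (hypothetical test harness, assuming an OpenBLAS build with CBLAS headers; note that cblas_isamax returns a 0-based index):

    #include <stdio.h>
    #include <cblas.h>

    int main(void)
    {
        /* |x| peaks at 5 twice, at 0-based indices 1 and 5; the fixed
         * kernel must report the first occurrence. */
        float x[8] = { 1.0f, -5.0f, 3.0f, 2.0f, -4.0f, 5.0f, 0.5f, 2.5f };
        size_t i = cblas_isamax(8, x, 1);
        printf("isamax = %zu (expected 1)\n", i);
        return 0;
    }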
