@@ -56,19 +56,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VI3 $vr8
 #define VI4 $vr19
 #define VT0 $vr23
+ #define VZE $vr3
+ #define VT1 $vr4
+ #define VT2 $vr5
+ #define VC0 $vr6

 PROLOGUE
 li.d i0, 0
 bge $r0, N, .L999
 bge $r0, INCX, .L999
 li.d TEMP, 1
+ vldi VZE, 0
 slli.d TEMP, TEMP, BASE_SHIFT
 slli.d INCX, INCX, BASE_SHIFT
 bne INCX, TEMP, .L20
 vld VM0, X, 0
 #ifdef DOUBLE
+ vfsub.d VT1, VZE, VM0
 addi.d i0, i0, 1
 srai.d I, N, 3
+ vfmaxa.d VM0, VM0, VT1
 bge $r0, I, .L11
 slli.d i0, i0, 1 //2
 vreplgr2vr.d VINC2, i0
@@ -79,12 +86,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 addi.d i0, i0, 1
 vinsgr2vr.d VI1, i0, 1
 addi.d i0, i0, 3
- vinsgr2vr.d VI0, i0, 0 //1
+ vinsgr2vr.d VI0, i0, 0 //initialize the index value for vectorization
 addi.d i0, i0, 1
- vinsgr2vr.d VI0, i0, 1 //2
+ vinsgr2vr.d VI0, i0, 1
 #else
+ vfsub.s VT1, VZE, VM0
 addi.w i0, i0, 1
 srai.d I, N, 3
+ vfmaxa.s VM0, VM0, VT1
 bge $r0, I, .L21
 slli.w i0, i0, 2 //4
 vreplgr2vr.w VINC2, i0
@@ -115,39 +124,51 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 vadd.d VI1, VI1, VINC4
 vld VX1, X, 2 * SIZE
 vadd.d VI2, VI1, VINC2
- vfmaxa.d x1, VX0, VX1
- vfcmp.ceq.d VT0, VX0, x1
- vbitsel.v x2, VI2, VI1, VT0
+ vfsub.d VT1, VZE, VX0
+ vfsub.d VT2, VZE, VX1
+ vfmaxa.d VX0, VX0, VT1
+ vfmaxa.d VX1, VX1, VT2
+ vfcmp.clt.d VT0, VX0, VX1 //abs(x0) < abs(x1)
+ vbitsel.v x1, VX0, VX1, VT0 //abs(maxf)
+ vbitsel.v x2, VI1, VI2, VT0 //i
+
 vld VX0, X, 4 * SIZE
 vadd.d VI1, VI2, VINC2
 vld VX1, X, 6 * SIZE
 vadd.d VI2, VI1, VINC2
- vfmaxa.d x3, VX0, VX1
- vfcmp.ceq.d VT0, VX0, x3
- vbitsel.v x4, VI2, VI1, VT0
- vfmaxa.d x3, x1, x3
- vfcmp.ceq.d VT0, x1, x3
- vbitsel.v x2, x4, x2, VT0
- vfmaxa.d VM1, VM0, x3
- vfcmp.ceq.d VT0, VM0, VM1
- vbitsel.v VM0, VM1, VM0, VT0
- vbitsel.v VI0, x2, VI0, VT0
+ vfsub.d VT1, VZE, VX0
+ vfsub.d VT2, VZE, VX1
+ vfmaxa.d VX0, VX0, VT1
+ vfmaxa.d VX1, VX1, VT2
+ vfcmp.clt.d VT0, VX0, VX1
+ vbitsel.v x3, VX0, VX1, VT0 //abs(maxf)
+ vbitsel.v x4, VI1, VI2, VT0 //i
+ vfcmp.clt.d VC0, x1, x3
+ vbitsel.v x1, x1, x3, VC0 //abs(maxf)
+ vbitsel.v x2, x2, x4, VC0 //i
+ vfcmp.clt.d VT0, VM0, x1
 addi.d I, I, -1
 addi.d X, X, 8 * SIZE
+ vbitsel.v VM0, VM0, x1, VT0
+ vbitsel.v VI0, VI0, x2, VT0
 #else
 vld VX0, X, 0 * SIZE
 vadd.w VI1, VI1, VINC4
 vld VX1, X, 4 * SIZE
 vadd.w VI2, VI1, VINC2
- vfmaxa.s VM1, VX0, VX1
- vfcmp.ceq.s VT0, VX0, VM1
+ vfsub.s VT1, VZE, VX0
+ vfsub.s VT2, VZE, VX1
+ vfmaxa.s VX0, VX0, VT1
+ vfmaxa.s VX1, VX1, VT2
+ vfcmp.clt.s VT0, VX0, VX1
+ vbitsel.v x1, VX0, VX1, VT0 //abs(maxf)
+ vbitsel.v x2, VI1, VI2, VT0 //i
 addi.d I, I, -1
- vbitsel.v VI2, VI2, VI1, VT0
- vfmaxa.s VM1, VM0, VM1
- vfcmp.ceq.s VT0, VM0, VM1
+ vfcmp.clt.s VT0, VM0, x1
 addi.d X, X, 8 * SIZE
- vbitsel.v VM0, VM1, VM0, VT0
- vbitsel.v VI0, VI2, VI0, VT0
+ vbitsel.v VM0, VM0, x1, VT0
+ vbitsel.v VI0, VI0, x2, VT0
+
 #endif
 blt $r0, I, .L10
 .align 3
@@ -158,7 +179,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 vreplvei.d VI2, VI0, 1
 vreplvei.d x1, VM0, 0
 vreplvei.d x2, VM0, 1
- fcmp.ceq.d $fcc0, $f10 , $f9
+ fcmp.ceq.d $fcc0, $f9 , $f10
 bceqz $fcc0, .L16
 vfcmp.clt.d VT0, VI1, VI2
 vbitsel.v VI0, VI2, VI1, VT0
@@ -172,28 +193,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 vreplvei.w x2, VM0, 1
 vreplvei.w x3, VM0, 2
 vreplvei.w x4, VM0, 3
- vfmaxa.s VM1, x1, x2
- vfcmp.ceq.s VT0, VM1, x1
- vbitsel.v VINC2, VI2, VI1, VT0
- vfmaxa.s VM0, x3, x4
- vfcmp.ceq.s VT0, x3, VM0
- vbitsel.v VINC4, VI4, VI3, VT0
- vfmaxa.s VM0, VM0, VM1
- vfcmp.ceq.s VT0, VM0, VM1
- vbitsel.v VI0, VINC4, VINC2, VT0
- fcmp.ceq.d $fcc0, $f15, $f9
- bceqz $fcc0, .L26
- vfcmp.clt.s VT0, VI1, VI0
- vbitsel.v VI0, VI0, VI1, VT0
 b .L26
 #endif
 .align 3

 #ifdef DOUBLE
 .L16:
- vfmaxa.d VM0 , x1, x2
- vfcmp.ceq.d VT0, x1, VM0
- vbitsel.v VI0, VI2, VI1 , VT0
+ vfcmp.clt.d VT0 , x1, x2
+ vbitsel.v VI0, VI1, VI2, VT0
+ vbitsel.v VM0, x1, x2 , VT0
 .align 3

 .L17:
@@ -212,10 +220,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 .L13:
 fld.d $f9, X, 0
- vfmaxa.d VM1, x1, VM0
- vfcmp.ceq.d VT0, VM0, VM1
- vbitsel.v VM0, VM1, VM0, VT0
- vbitsel.v VI0, VI1, VI0, VT0
+ fsub.d $f10, $f3, $f9
+ vfmaxa.d x1, x1, x2
+ vfcmp.clt.d VT0, VM0, x1
+ vbitsel.v VM0, VM0, x1, VT0
+ vbitsel.v VI0, VI0, VI1, VT0
 addi.d I, I, -1
 addi.d i1, i1, 1
 addi.d X, X, SIZE
@@ -241,10 +250,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 add.d TEMP, TEMP, INCX
 vinsgr2vr.d VM0, t2, 1
 slli.d i0, i0, 1 //2
+ vfsub.d VT1, VZE, VM0
 vreplgr2vr.d VINC2, i0
 slli.d i0, i0, 1 //4
 vreplgr2vr.d VINC4, i0
 addi.d i0, i0, -7
+ vfmaxa.d VM0, VM0, VT1
 vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
 addi.d i0, i0, 1
 vinsgr2vr.d VI1, i0, 1
@@ -269,9 +280,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 add.d X, X, INCX
 vinsgr2vr.d VX1, t2, 1
 vadd.d VI2, VI1, VINC2
- vfmaxa.d x1, VX0, VX1
- vfcmp.ceq.d VT0, VX0, x1
- vbitsel.v x2, VI2, VI1, VT0
+
+ vfsub.d VT1, VZE, VX0
+ vfsub.d VT2, VZE, VX1
+ vfmaxa.d VX0, VX0, VT1
+ vfmaxa.d VX1, VX1, VT2
+ vfcmp.clt.d VT0, VX0, VX1
+ vbitsel.v x1, VX0, VX1, VT0
+ vbitsel.v x2, VI1, VI2, VT0
 ld.d t1, X, 0 * SIZE
 add.d X, X, INCX
 vinsgr2vr.d VX0, t1, 0
@@ -286,16 +302,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 add.d X, X, INCX
 vinsgr2vr.d VX1, t2, 1
 vadd.d VI2, VI1, VINC2
- vfmaxa.d x3, VX0, VX1
- vfcmp.ceq.d VT0, VX0, x3
- vbitsel.v x4, VI2, VI1, VT0
- vfmaxa.d x3, x1, x3
- vfcmp.ceq.d VT0, x1, x3
- vbitsel.v x2, x4, x2, VT0
- vfmaxa.d VM1, VM0, x3
- vbitsel.v VM0, VM1, VM0, VT0
- vfcmp.ceq.d VT0, VM0, VM1
- vbitsel.v VI0, x2, VI0, VT0
+ vfsub.d VT1, VZE, VX0
+ vfsub.d VT2, VZE, VX1
+ vfmaxa.d VX0, VX0, VT1
+ vfmaxa.d VX1, VX1, VT2
+ vfcmp.clt.d VT0, VX0, VX1
+ vbitsel.v x3, VX0, VX1, VT0
+ vbitsel.v x4, VI1, VI2, VT0
+ vfcmp.clt.d VC0, x1, x3
+ vbitsel.v x1, x1, x3, VC0
+ vbitsel.v x2, x2, x4, VC0
+ vfcmp.clt.d VT0, VM0, x1
+ vbitsel.v VM0, VM0, x1, VT0
+ vbitsel.v VI0, VI0, x2, VT0
+
 addi.d I, I, -1
 blt $r0, I, .L24
 .align 3
@@ -313,9 +333,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .align 3

 .L26:
- vfmaxa.d VM0 , x1, x2
- vfcmp.ceq.d VT0, x1, VM0
- vbitsel.v VI0, VI2, VI1 , VT0
+ vfcmp.clt.d VT0 , x1, x2
+ vbitsel.v VI0, VI1, VI2, VT0
+ vbitsel.v VM0, x1, x2 , VT0
 .align 3

 .L27:
@@ -389,14 +409,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 vinsgr2vr.w VX1, t3, 2
 vinsgr2vr.w VX1, t4, 3
 vadd.w VI2, VI1, VINC2
- vfmaxa.s VM1, VX0, VX1
- vfcmp.ceq.s VT0, VX0, VM1
- vbitsel.v VI2, VI2, VI1, VT0
- vfmaxa.s VM1, VM0, VM1
- vfcmp.ceq.s VT0, VM0, VM1
+ vfsub.s VT1, VZE, VX0
+ vfsub.s VT2, VZE, VX1
+ vfmaxa.s VX0, VX0, VT1
+ vfmaxa.s VX1, VX1, VT2
+ vfcmp.clt.s VT0, VX0, VX1
+ vbitsel.v x1, VX0, VX1, VT0
+ vbitsel.v x2, VI1, VI2, VT0 //i
+
 addi.d I, I, -1
- vbitsel.v VM0, VM1, VM0, VT0
- vbitsel.v VI0, VI2, VI0, VT0
+ vfcmp.clt.s VT0, VM0, x1
+ vbitsel.v VM0, VM0, x1, VT0
+ vbitsel.v VI0, VI0, x2, VT0
 blt $r0, I, .L24
 .align 3

@@ -409,42 +433,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 vreplvei.w x2, VM0, 1
 vreplvei.w x3, VM0, 2
 vreplvei.w x4, VM0, 3
- vfmaxa.s VM1, x1, x2
- vfcmp.ceq.s VT0, VM1, x1
- vbitsel.v VINC2, VI2, VI1, VT0
- vfmaxa.s VM0, x3, x4
- vfcmp.ceq.s VT0, x3, VM0
- vbitsel.v VINC4, VI4, VI3, VT0
- vfmaxa.s VM0, VM0, VM1
- vfcmp.ceq.s VT0, VM0, VM1
- vbitsel.v VI0, VINC4, VINC2, VT0
- fcmp.ceq.d $fcc0, $f15, $f9
- bceqz $fcc0, .L26
- vfcmp.clt.s VT0, VI1, VI0
- vbitsel.v VI0, VI0, VI1, VT0
 .align 3

 .L26:
- fcmp.ceq.d $fcc0, $f15, $f10
- bceqz $fcc0, .L27
- vfcmp.clt.s VT0, VI2, VI0
- vbitsel.v VI0, VI0, VI2, VT0
+ fcmp.ceq.s $fcc0, $f9, $f10
+ bceqz $fcc0, .L31
+ vfcmp.clt.s VT0, VI1, VI2
+ vbitsel.v VI1, VI2, VI1, VT0
+ b .L32
 .align 3
-
- .L27:
- fcmp.ceq.d $fcc0, $f15, $f11
- bceqz $fcc0, .L28
- vfcmp.clt.s VT0, VI3, VI0
- vbitsel.v VI0, VI0, VI3, VT0
+ .L31:
+ vfcmp.clt.s VT0, x1, x2
+ vbitsel.v VI1, VI1, VI2, VT0
+ vbitsel.v x1, x1, x2, VT0
 .align 3
-
- .L28:
- fcmp.ceq.d $fcc0, $f15, $f12
- bceqz $fcc0, .L29
- vfcmp.clt.s VT0, VI4, VI0
- vbitsel.v VI0, VI0, VI4, VT0
+ .L32:
+ fcmp.ceq.s $fcc0, $f11, $f12
+ bceqz $fcc0, .L33
+ vfcmp.clt.s VT1, VI3, VI4
+ vbitsel.v VI3, VI4, VI3, VT1
+ b .L34
+ .align 3
+ .L33:
+ vfcmp.clt.s VT1, x3, x4
+ vbitsel.v x3, x3, x4, VT1
+ vbitsel.v VI3, VI3, VI4, VT1
+ .align 3
+ .L34:
+ fcmp.ceq.s $fcc0, $f9, $f11
+ bceqz $fcc0, .L35
+ vfcmp.clt.s VT0, VI1, VI3
+ vbitsel.v VI0, VI3, VI1, VT0
+ vxor.v VM0, x1, VZE
+ b .L29
+ .align 3
+ .L35:
+ vfcmp.clt.s VT0, x1, x3
+ vbitsel.v VM0, x1, x3, VT0
+ vbitsel.v VI0, VI1, VI3, VT0
 .align 3
-
 .L29:
 movfr2gr.s i0, $f20
 .align 3
@@ -462,10 +489,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 .L22:
 LD $f9, X, 0
- VFMAXA VM1, x1, VM0
- VCMPEQ VT0, VM0, VM1
- vbitsel.v VM0, VM1, VM0, VT0
- vbitsel.v VI0, VI1, VI0, VT0
+ #ifdef DOUBLE
+ fsub.d $f10, $f3, $f9
+ vfmaxa.d x1, x1, x2
+ vfcmp.clt.d VT0, VM0, x1
+ #else
+ fsub.s $f10, $f3, $f9
+ vfmaxa.s x1, x1, x2
+ vfcmp.clt.s VT0, VM0, x1
+ #endif
+ vbitsel.v VM0, VM0, x1, VT0
+ vbitsel.v VI0, VI0, VI1, VT0
 addi.d I, I, -1
 addi.d i1, i1, 1
 add.d X, X, INCX
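For reference: the rewrite forms |x| as max(x, 0 - x) via the vfsub/vfmaxa pair, then drives both the value select and the index select from a single vfcmp.clt mask, so the reported index always matches the surviving value. Below is a minimal scalar C sketch of that per-lane logic under IEEE semantics; the helper name and the driver are illustrative, not from the source.

#include <math.h>
#include <stddef.h>
#include <stdio.h>

/* One update step of the max-|x|-with-index scan. fmax(x, -x) == |x|,
 * mirroring the vfsub-from-zero + vfmaxa pair in the patch. */
static void update_max_abs(double x, size_t i, double *maxf, size_t *maxi)
{
    double ax = fmax(x, 0.0 - x);   /* |x| */
    int take = *maxf < ax;          /* mirrors vfcmp.clt -> selection mask */
    *maxf = take ? ax : *maxf;      /* mirrors vbitsel.v on the value */
    *maxi = take ? i : *maxi;       /* same mask picks the index */
}

int main(void)
{
    double x[] = { 1.0, -3.5, 2.0, 3.5 };
    double maxf = 0.0;
    size_t maxi = 0;
    for (size_t i = 0; i < sizeof x / sizeof x[0]; i++)
        update_max_abs(x[i], i + 1, &maxf, &maxi);  /* 1-based, as in BLAS */
    printf("iamax = %zu (|x| = %g)\n", maxi, maxf); /* prints: iamax = 2 */
}

Because the compare is a strict less-than, ties keep the earlier index (here the -3.5 at position 2 rather than the 3.5 at position 4), which is the first-occurrence behavior the vbitsel.v operand order in the patch encodes.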