@@ -63,42 +63,60 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
63
63
bge $r0, N, .L999
64
64
bge $r0, INCX, .L999
65
65
li.d TEMP, 1
66
- li.w I, -1
67
66
slli.d TEMP, TEMP, ZBASE_SHIFT
68
67
slli.d INCX, INCX, ZBASE_SHIFT
69
- xvreplgr2vr.w neg1, I
70
- xvffint.s.w neg1, neg1
71
68
srai.d I, N, 3
72
69
bne INCX, TEMP, .L20
73
70
bge $r0, I, .L23
74
71
.align 3
75
72
76
73
.L10:
// Unit-stride main loop. Reached by fall-through when the scaled INCX equals
// one element size (the `bne INCX, TEMP, .L20` test above was not taken) and
// I = N >> 3 is positive. Each pass advances X by 16 * SIZE and decrements I.
//
// Per pass: load 32-byte vectors, split them into even-indexed (x1) and
// odd-indexed (x2) elements -- presumably the real and imaginary parts of
// interleaved complex data; confirm against the caller's layout. Then form
// max(v, res0 - v) for each (an absolute value if res0 is zero -- its
// initialisation is not visible here; TODO confirm), add the two halves and
// fold the sums into the running maximum VM0.
//
// XVFSUB / XVFMAX / XVFADD are macros defined outside this chunk; presumably
// they select the single- or double-precision LASX instruction.
77
- xvld VX0, X, 0 * SIZE
78
- xvld VX1, X, 8 * SIZE
79
- addi.d I, I, -1
74
+ xvld VX0, X, 0
75
+ xvld VX1, X, 32
76
+ #ifdef DOUBLE
77
+ xvpickev.d x1, VX1, VX0
78
+ xvpickod.d x2, VX1, VX0
79
+ #else
80
80
xvpickev.w x1, VX1, VX0
81
81
xvpickod.w x2, VX1, VX0
82
- xvfmul.s x3, neg1, x1
83
- xvfmul.s x4, neg1, x2
84
- xvfcmp.clt.s VT0, x1, res0
85
- xvfcmp.clt.s VT1, x2, res0
86
- xvbitsel.v x1, x1, x3, VT0
87
- xvbitsel.v x2, x2, x4, VT1
82
+ #endif
83
+ XVFSUB x3, res0, x1
84
+ XVFSUB x4, res0, x2
85
+ XVFMAX x1, x1, x3
86
+ XVFMAX x2, x2, x4
87
+ XVFADD VM1, x1, x2
88
+ XVFMAX VM0, VM0, VM1
// Under DOUBLE the two loads above cover only bytes 0..63 of the 16 * SIZE
// bytes consumed per pass, so a second pair of loads handles offsets 64 and 96.
89
+ #ifdef DOUBLE
90
+ xvld VX0, X, 64
91
+ xvld VX1, X, 96
92
+ xvpickev.d x1, VX1, VX0
93
+ xvpickod.d x2, VX1, VX0
94
+ XVFSUB x3, res0, x1
95
+ XVFSUB x4, res0, x2
96
+ XVFMAX x1, x1, x3
97
+ XVFMAX x2, x2, x4
98
+ XVFADD VM1, x1, x2
99
+ XVFMAX VM0, VM0, VM1
100
+ #endif
101
+ addi.d I, I, -1
88
102
addi.d X, X, 16 * SIZE
89
- xvfadd.s VM1, x1, x2
90
- xvfmax.s VM0, VM0, VM1
91
103
// Loop while the remaining block count I is still positive.
blt $r0, I, .L10
92
104
.align 3
93
105
94
106
.L11:
    // Horizontal reduction: collapse every element of the running-maximum
    // vector VM0 into element 0, then continue with the scalar tail at .L23.
    //
    // VM0 is a 256-bit LASX register, so it carries four doubles or eight
    // floats. xvpickve copies the selected element into element 0 of the
    // destination, so all indices (0..3 for .d, 0..7 for .w) must be visited;
    // reducing only the low 128 bits would drop maxima held in the upper lane.
    //
    // Clobbers x1..x4 (and VM1 in the single-precision path). The
    // single-precision reduction has always overwritten x1..x4, so they are
    // assumed dead on entry to .L23 for both precisions -- confirm against
    // the register #defines at the top of the file.
#ifdef DOUBLE
    xvpickve.d  x1, VM0, 0
    xvpickve.d  x2, VM0, 1
    xvpickve.d  x3, VM0, 2
    xvpickve.d  x4, VM0, 3
    XVFMAX      x1, x1, x2              // max of elements 0 and 1
    XVFMAX      x3, x3, x4              // max of elements 2 and 3
    XVFMAX      VM0, x1, x3             // element 0 = max over all four
#else
    xvpickve.w  x1, VM0, 0
    xvpickve.w  x2, VM0, 1
    xvpickve.w  x3, VM0, 2
    xvpickve.w  x4, VM0, 3
    XVFMAX      x1, x1, x2
    XVFMAX      x3, x3, x4
    XVFMAX      VM1, x1, x3             // max of elements 0..3
    xvpickve.w  x1, VM0, 4
    xvpickve.w  x2, VM0, 5
    xvpickve.w  x3, VM0, 6
    xvpickve.w  x4, VM0, 7
    XVFMAX      x1, x1, x2
    XVFMAX      x3, x3, x4
    XVFMAX      x1, x1, x3              // max of elements 4..7
    XVFMAX      VM0, VM1, x1            // element 0 = max over all eight
#endif
    b           .L23
    .align 3
104
122
@@ -107,66 +125,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
107
125
.align 3
108
126
109
127
.L21:
    // Strided main loop: each pass consumes eight elements (two values at
    // X + 0 * SIZE and X + 1 * SIZE per element, X advanced by INCX after
    // each), in four groups of two elements. I is decremented once per pass.
    //
    // For every group: t1 = |a| + |b| of the first element, t3 = the same for
    // the second, and the larger of the two is folded into the running
    // maximum s1. Previously each group overwrote its accumulator
    // (FMAX sN, t1, t3), so the first group of every pass and all results of
    // earlier passes were discarded.
    //
    // s3 and s4 are still written by groups 3 and 4 because the merge at .L22
    // reads them. s1 is assumed to hold a valid running maximum on entry (the
    // tail loop at .L23 relies on the same thing) -- confirm against the
    // initialisation in the prologue.

    // ---- group 1 ----
    LD      t1, X, 0 * SIZE
    LD      t2, X, 1 * SIZE
    add.d   X, X, INCX
    LD      t3, X, 0 * SIZE
    LD      t4, X, 1 * SIZE
    add.d   X, X, INCX
    FABS    t1, t1
    FABS    t2, t2
    FABS    t3, t3
    FABS    t4, t4
    ADD     t1, t1, t2
    ADD     t3, t3, t4
    FMAX    t1, t1, t3
    FMAX    s1, s1, t1                  // accumulate, do not overwrite

    // ---- group 2 ----
    LD      t1, X, 0 * SIZE
    LD      t2, X, 1 * SIZE
    add.d   X, X, INCX
    LD      t3, X, 0 * SIZE
    LD      t4, X, 1 * SIZE
    add.d   X, X, INCX
    FABS    t1, t1
    FABS    t2, t2
    FABS    t3, t3
    FABS    t4, t4
    ADD     t1, t1, t2
    ADD     t3, t3, t4
    FMAX    t1, t1, t3
    FMAX    s1, s1, t1

    // ---- group 3 ----
    LD      t1, X, 0 * SIZE
    LD      t2, X, 1 * SIZE
    add.d   X, X, INCX
    LD      t3, X, 0 * SIZE
    LD      t4, X, 1 * SIZE
    add.d   X, X, INCX
    FABS    t1, t1
    FABS    t2, t2
    FABS    t3, t3
    FABS    t4, t4
    addi.d  I, I, -1                    // one pass of eight elements done
    ADD     t1, t1, t2
    ADD     t3, t3, t4
    FMAX    s3, t1, t3                  // keep s3 defined for .L22
    FMAX    s1, s1, s3

    // ---- group 4 ----
    LD      t1, X, 0 * SIZE
    LD      t2, X, 1 * SIZE
    add.d   X, X, INCX
    LD      t3, X, 0 * SIZE
    LD      t4, X, 1 * SIZE
    add.d   X, X, INCX
    FABS    t1, t1
    FABS    t2, t2
    FABS    t3, t3
    FABS    t4, t4
    ADD     t1, t1, t2
    ADD     t3, t3, t4
    FMAX    s4, t1, t3                  // keep s4 defined for .L22
    FMAX    s1, s1, s4

    blt     $r0, I, .L21                // repeat while I > 0
    .align 3
165
183
166
184
.L22:
    // Merge the four scalar accumulators into s1, then fall through to the
    // tail handling at .L23. The two pairwise maxima are independent, so
    // their order does not affect the result.
    FMAX    s3, s3, s4                  // s3 = max(s3, s4)
    FMAX    s1, s1, s2                  // s1 = max(s1, s2)
    FMAX    s1, s1, s3                  // s1 = max of all four
    .align 3
171
189
172
190
.L23: //N<8
@@ -182,12 +200,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
182
200
FABS a1, a1
183
201
ADD a0, a0, a1
184
202
add .d X, X, INCX
185
- fmax.s s1, a0, s1
203
+ FMAX s1, a0, s1
186
204
blt $r0, I, .L24
187
205
.align 3
188
206
189
207
.L999:
// Common exit, also the target of the early N <= 0 / INCX <= 0 checks.
// Copies $f22 into $f0 and jumps to the address in $r1 without saving a
// link ($r0 as destination). $f22 is presumably the register behind the s1
// accumulator, and $f0 / $r1 the FP return-value and return-address
// registers -- confirm against the register #defines and the ABI.
// MOV is a macro defined outside this chunk.
190
- fmov.s $f0, $f22
208
+ MOV $f0, $f22
191
209
jirl $r0, $r1, 0x0
192
210
.align 3
193
211
0 commit comments