+ #define ASSEMBLER
+
+ #include "common.h"
+
+ #define N $r4
+ #define X $r5
+ #define INCX $r6
+ #define I $r12
+ #define t1 $r13
+ #define t2 $r15
+ #define t3 $r18
+ #define t4 $r16
+ #define i0 $r17
+ #define i1 $r14
+ #define TEMP $r19
+ #define x1 $xr9
+ #define x2 $xr10
+ #define x3 $xr11
+ #define x4 $xr12
+ #define VX0 $xr13
+ #define VX1 $xr14
+ #define VM0 $xr15
+ #define VM1 $xr16
+ #define VINC4 $xr17
+ #define VINC8 $xr18
+ #define VI0 $xr20
+ #define VI1 $xr21
+ #define VI2 $xr22
+ #define VI3 $xr8
+ #define VI4 $xr19
+ #define VT0 $xr23
+
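+ // LASX kernel that computes the 1-based index of the maximum element of a
+ // double-precision vector X of length N with stride INCX; the index is
+ // accumulated in i0 and returned in $r4.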
+ PROLOGUE
+ li.d i0, 0
+ bge $r0, N, .L999
+ bge $r0, INCX, .L999
+ li.d TEMP, 1
+ slli.d TEMP, TEMP, BASE_SHIFT
+ slli.d INCX, INCX, BASE_SHIFT
+ bne INCX, TEMP, .L20
+ xvld VM0, X, 0
+ addi.d i0, i0, 1
+ srai.d I, N, 3
+ bge $r0, I, .L21
+ slli.d i0, i0, 2 //4
+ xvreplgr2vr.d VINC4, i0
+ slli.d i0, i0, 1 //8
+ xvreplgr2vr.d VINC8, i0
+ addi.d i0, i0, -15
+ xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+ addi.d i0, i0, 1
+ xvinsgr2vr.d VI1, i0, 1
+ addi.d i0, i0, 1
+ xvinsgr2vr.d VI1, i0, 2
+ addi.d i0, i0, 1
+ xvinsgr2vr.d VI1, i0, 3
+ addi.d i0, i0, 5
+ xvinsgr2vr.d VI0, i0, 0 //1
+ addi.d i0, i0, 1
+ xvinsgr2vr.d VI0, i0, 1 //2
+ addi.d i0, i0, 1
+ xvinsgr2vr.d VI0, i0, 2 //3
+ addi.d i0, i0, 1
+ xvinsgr2vr.d VI0, i0, 3 //4
+ .align 3
+
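+ // Unit-stride (INCX == 1) main loop: each iteration loads 8 doubles, keeps the
+ // per-lane maxima in VM0 and their element indices in VI0.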
+ .L10:
+ xvld VX0, X, 0 * SIZE
+ xvadd.d VI1, VI1, VINC8
+ xvld VX1, X, 4 * SIZE
+ xvadd.d VI2, VI1, VINC4
+ xvfcmp.clt.d VT0, VX0, VX1
+ addi.d I, I, -1
+ xvbitsel.v VM1, VX0, VX1, VT0
+ xvbitsel.v VI2, VI1, VI2, VT0
+ xvfcmp.clt.d VT0, VM0, VM1
+ addi.d X, X, 8 * SIZE
+ xvbitsel.v VM0, VM0, VM1, VT0
+ xvbitsel.v VI0, VI0, VI2, VT0
+ blt $r0, I, .L10
+ .align 3
+
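+ // Reduce the four lanes of VM0/VI0 to a single maximum value and its index.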
+ .L15:
+ xvpickve.d VI1, VI0, 0
+ xvpickve.d VI2, VI0, 1
+ xvpickve.d VI3, VI0, 2
+ xvpickve.d VI4, VI0, 3
+ xvpickve.d x1, VM0, 0
+ xvpickve.d x2, VM0, 1
+ xvpickve.d x3, VM0, 2
+ xvpickve.d x4, VM0, 3
+ xvfcmp.clt.d VT0, x1, x2
+ xvbitsel.v VM1, x1, x2, VT0
+ xvbitsel.v VINC4, VI1, VI2, VT0
+ xvfcmp.clt.d VT0, x3, x4
+ xvbitsel.v VM0, x3, x4, VT0
+ xvbitsel.v VINC8, VI3, VI4, VT0
+ xvfcmp.clt.d VT0, VM0, VM1
+ xvbitsel.v VM0, VM0, VM1, VT0
+ xvbitsel.v VI0, VINC8, VINC4, VT0
+ li.d TEMP, 1 //take the smallest index when elements are equal to the maximum
+ movgr2fr.d $f17, TEMP
+ ffint.d.l $f17, $f17
+ xvfcmp.ceq.d VT0, VM0, x1
+ fcmp.ceq.d $fcc0, $f23, $f17
+ bceqz $fcc0, .L26
+ xvfcmp.clt.d VT0, VI1, VI0
+ xvbitsel.v VI0, VI0, VI1, VT0
+ b .L26
+ .align 3
+
+
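+ // Non-unit-stride setup: gather the first elements with scalar loads and
+ // initialize the index vectors as in the unit-stride path.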
+ .L20: // INCX!=1
+ move TEMP, X
+ addi.d i0, i0, 1
+ ld.d t1, TEMP, 0 * SIZE
+ add.d TEMP, TEMP, INCX
+ xvinsgr2vr.d VM0, t1, 0
+ srai.d I, N, 3
+ bge $r0, I, .L21
+ ld.d t2, TEMP, 0 * SIZE
+ add.d TEMP, TEMP, INCX
+ ld.d t3, TEMP, 0 * SIZE
+ add.d TEMP, TEMP, INCX
+ ld.d t4, TEMP, 0 * SIZE
+ add.d TEMP, TEMP, INCX
+ xvinsgr2vr.d VM0, t2, 1
+ xvinsgr2vr.d VM0, t3, 2
+ xvinsgr2vr.d VM0, t4, 3
+ slli.d i0, i0, 2 //4
+ xvreplgr2vr.d VINC4, i0
+ slli.d i0, i0, 1 //8
+ xvreplgr2vr.d VINC8, i0
+ addi.d i0, i0, -15
+ xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+ addi.d i0, i0, 1
+ xvinsgr2vr.d VI1, i0, 1
+ addi.d i0, i0, 1
+ xvinsgr2vr.d VI1, i0, 2
+ addi.d i0, i0, 1
+ xvinsgr2vr.d VI1, i0, 3
+ addi.d i0, i0, 5
+ xvinsgr2vr.d VI0, i0, 0 //1
+ addi.d i0, i0, 1
+ xvinsgr2vr.d VI0, i0, 1 //2
+ addi.d i0, i0, 1
+ xvinsgr2vr.d VI0, i0, 2 //3
+ addi.d i0, i0, 1
+ xvinsgr2vr.d VI0, i0, 3 //4
+ .align 3
+
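+ // Strided main loop: load 8 elements one at a time with ld.d/xvinsgr2vr.d,
+ // then update the running maxima and indices exactly as in .L10.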
+ .L24:
+ ld.d t1, X, 0 * SIZE
+ add.d X, X, INCX
+ ld.d t2, X, 0 * SIZE
+ add.d X, X, INCX
+ ld.d t3, X, 0 * SIZE
+ add.d X, X, INCX
+ ld.d t4, X, 0 * SIZE
+ add.d X, X, INCX
+ xvinsgr2vr.d VX0, t1, 0
+ xvinsgr2vr.d VX0, t2, 1
+ xvinsgr2vr.d VX0, t3, 2
+ xvinsgr2vr.d VX0, t4, 3
+ xvadd.d VI1, VI1, VINC8
+ ld.d t1, X, 0 * SIZE
+ add.d X, X, INCX
+ ld.d t2, X, 0 * SIZE
+ add.d X, X, INCX
+ ld.d t3, X, 0 * SIZE
+ add.d X, X, INCX
+ ld.d t4, X, 0 * SIZE
+ add.d X, X, INCX
+ xvinsgr2vr.d VX1, t1, 0
+ xvinsgr2vr.d VX1, t2, 1
+ xvinsgr2vr.d VX1, t3, 2
+ xvinsgr2vr.d VX1, t4, 3
+ xvadd.d VI2, VI1, VINC4
+ xvfcmp.clt.d VT0, VX0, VX1
+ addi.d I, I, -1
+ xvbitsel.v VM1, VX0, VX1, VT0
+ xvbitsel.v VI2, VI1, VI2, VT0
+ xvfcmp.clt.d VT0, VM0, VM1
+ xvbitsel.v VM0, VM0, VM1, VT0
+ xvbitsel.v VI0, VI0, VI2, VT0
+ blt $r0, I, .L24
+ .align 3
+
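+ // Lane reduction for the strided path; identical to .L15.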
+ .L25:
+ xvpickve.d VI1, VI0, 0
+ xvpickve.d VI2, VI0, 1
+ xvpickve.d VI3, VI0, 2
+ xvpickve.d VI4, VI0, 3
+ xvpickve.d x1, VM0, 0
+ xvpickve.d x2, VM0, 1
+ xvpickve.d x3, VM0, 2
+ xvpickve.d x4, VM0, 3
+ xvfcmp.clt.d VT0, x1, x2
+ xvbitsel.v VM1, x1, x2, VT0
+ xvbitsel.v VINC4, VI1, VI2, VT0
+ xvfcmp.clt.d VT0, x3, x4
+ xvbitsel.v VM0, x3, x4, VT0
+ xvbitsel.v VINC8, VI3, VI4, VT0
+ xvfcmp.clt.d VT0, VM0, VM1
+ xvbitsel.v VM0, VM0, VM1, VT0
+ xvbitsel.v VI0, VINC8, VINC4, VT0
+ li.d TEMP, 1 //take the smallest index when elements are equal to the maximum
+ movgr2fr.d $f17, TEMP
+ ffint.d.l $f17, $f17
+ xvfcmp.ceq.d VT0, VM0, x1
+ fcmp.ceq.d $fcc0, $f23, $f17
+ bceqz $fcc0, .L26
+ xvfcmp.clt.d VT0, VI1, VI0
+ xvbitsel.v VI0, VI0, VI1, VT0
+ .align 3
+
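+ // .L26-.L29: whenever another lane holds a value equal to the maximum,
+ // keep the smaller of the two indices.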
+ .L26:
+ xvfcmp.ceq.d VT0, VM0, x2
+ fcmp.ceq.d $fcc0, $f23, $f17
+ bceqz $fcc0, .L27
+ xvfcmp.clt.d VT0, VI2, VI0
+ xvbitsel.v VI0, VI0, VI2, VT0
+ .align 3
+
+ .L27:
+ xvfcmp.ceq.d VT0, VM0, x3
+ fcmp.ceq.d $fcc0, $f23, $f17
+ bceqz $fcc0, .L28
+ xvfcmp.clt.d VT0, VI3, VI0
+ xvbitsel.v VI0, VI0, VI3, VT0
+ .align 3
+
+ .L28:
+ xvfcmp.ceq.d VT0, VM0, x4
+ fcmp.ceq.d $fcc0, $f23, $f17
+ bceqz $fcc0, .L29
+ xvfcmp.clt.d VT0, VI4, VI0
+ xvbitsel.v VI0, VI0, VI4, VT0
+ .align 3
+
+ .L29:
+ movfr2gr.d i0, $f20
+ .align 3
+
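+ // Scalar tail loop: handle the remaining N % 8 elements (this is also the
+ // whole vector when the SIMD path was skipped).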
+ .L21: //N<8
+ andi I, N, 7
+ bge $r0, I, .L999
+ srai.d i1, N, 3
+ slli.d i1, i1, 3
+ addi.d i1, i1, 1 //current index
+ movgr2fr.d $f21, i1
+ movgr2fr.d $f20, i0
+ .align 3
+
+ .L22:
+ fld.d $f9, X, 0
+ addi.d I, I, -1
+ fcmp.clt.d $fcc0, $f15, $f9
+ add.d X, X, INCX
+ fsel $f15, $f15, $f9, $fcc0
+ fsel $f20, $f20, $f21, $fcc0
+ addi.d i1, i1, 1
+ movgr2fr.d $f21, i1
+ blt $r0, I, .L22
+ movfr2gr.d i0, $f20
+ .align 3
+
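+ // Return the index: i0 ($r17) is moved into the return register $r4.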
+ .L999:
+ move $r4, $r17
+ jirl $r0, $r1, 0x0
+ .align 3
+
+ EPILOGUE