+ #define ASSEMBLER
+
+ #include "common.h"
+
+ #define N $r4
+ #define X $r5
+ #define INCX $r6
+ #define I $r12
+ #define t1 $r13
+ #define t2 $r15
+ #define t3 $r18
+ #define t4 $r16
+ #define i0 $r17
+ #define i1 $r14
+ #define TEMP $r19
+ #define x1 $xr9
+ #define x2 $xr10
+ #define x3 $xr11
+ #define x4 $xr12
+ #define VX0 $xr13
+ #define VX1 $xr14
+ #define VM0 $xr15
+ #define VM1 $xr16
+ #define VINC4 $xr17
+ #define VINC8 $xr18
+ #define VI0 $xr20
+ #define VI1 $xr21
+ #define VI2 $xr22
+ #define VI3 $xr8
+ #define VI4 $xr19
+ #define VT0 $xr23
+
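+ // This kernel appears to implement IDMIN: it returns in $r4 the 1-based index
+ // of the minimum double-precision element of a vector X of length N with
+ // stride INCX. The vector paths process 8 elements per iteration, with VM0
+ // holding the per-lane running minima and VI0 the corresponding indices.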
+ PROLOGUE
+ li.d i0, 0
+ bge $r0, N, .L999
+ bge $r0, INCX, .L999
+ li.d TEMP, 1
+ slli.d TEMP, TEMP, BASE_SHIFT
+ slli.d INCX, INCX, BASE_SHIFT
+ bne INCX, TEMP, .L20
+ xvld VM0, X, 0
+ addi.d i0, i0, 1
+ srai.d I, N, 3
+ bge $r0, I, .L21
+ slli.d i0, i0, 2 //4
+ xvreplgr2vr.d VINC4, i0
+ slli.d i0, i0, 1 //8
+ xvreplgr2vr.d VINC8, i0
+ addi.d i0, i0, -15
+ xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+ addi.d i0, i0, 1
+ xvinsgr2vr.d VI1, i0, 1
+ addi.d i0, i0, 1
+ xvinsgr2vr.d VI1, i0, 2
+ addi.d i0, i0, 1
+ xvinsgr2vr.d VI1, i0, 3
+ addi.d i0, i0, 5
+ xvinsgr2vr.d VI0, i0, 0 //1
+ addi.d i0, i0, 1
+ xvinsgr2vr.d VI0, i0, 1 //2
+ addi.d i0, i0, 1
+ xvinsgr2vr.d VI0, i0, 2 //3
+ addi.d i0, i0, 1
+ xvinsgr2vr.d VI0, i0, 3 //4
+ .align 3
+
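+ // .L10: main loop of the unit-stride (INCX == 1) path. Each iteration loads
+ // 8 doubles, takes the per-lane minimum of the two 4-element halves, then
+ // folds it into the running minima VM0 and the index vector VI0.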
+ .L10:
+ xvld VX0, X, 0 * SIZE
+ xvadd.d VI1, VI1, VINC8
+ xvld VX1, X, 4 * SIZE
+ xvadd.d VI2, VI1, VINC4
+ xvfcmp.clt.d VT0, VX1, VX0
+ addi.d I, I, -1
+ xvbitsel.v VM1, VX0, VX1, VT0
+ xvbitsel.v VI2, VI1, VI2, VT0
+ xvfcmp.clt.d VT0, VM1, VM0
+ addi.d X, X, 8 * SIZE
+ xvbitsel.v VM0, VM0, VM1, VT0
+ xvbitsel.v VI0, VI0, VI2, VT0
+ blt $r0, I, .L10
+ .align 3
+
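+ // .L15: horizontal reduction for the unit-stride path. The four lanes of
+ // VM0/VI0 are extracted and pairwise reduced to a single minimum value and
+ // its index before falling through to the tie-break checks at .L26.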
+ .L15:
+ xvpickve.d VI1, VI0, 0
+ xvpickve.d VI2, VI0, 1
+ xvpickve.d VI3, VI0, 2
+ xvpickve.d VI4, VI0, 3
+ xvpickve.d x1, VM0, 0
+ xvpickve.d x2, VM0, 1
+ xvpickve.d x3, VM0, 2
+ xvpickve.d x4, VM0, 3
+ xvfcmp.clt.d VT0, x2, x1
+ xvbitsel.v VM1, x1, x2, VT0
+ xvbitsel.v VINC4, VI1, VI2, VT0
+ xvfcmp.clt.d VT0, x4, x3
+ xvbitsel.v VM0, x3, x4, VT0
+ xvbitsel.v VINC8, VI3, VI4, VT0
+ xvfcmp.clt.d VT0, VM1, VM0
+ xvbitsel.v VM0, VM0, VM1, VT0
+ xvbitsel.v VI0, VINC8, VINC4, VT0
+ li.d TEMP, 1 //when values are equal, take the smallest index
+ movgr2fr.d $f17, TEMP
+ ffint.d.l $f17, $f17
+ xvfcmp.ceq.d VT0, VM0, x1
+ fcmp.ceq.d $fcc0, $f23, $f17
+ bceqz $fcc0, .L26
+ xvfcmp.clt.d VT0, VI1, VI0
+ xvbitsel.v VI0, VI0, VI1, VT0
+ b .L26
+ .align 3
+
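+ // .L20: non-unit-stride (INCX != 1) path. The first elements are gathered
+ // with scalar loads to seed VM0, and the same 1-based index vectors are set
+ // up as in the unit-stride path.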
+ .L20: // INCX!=1
+ move TEMP, X
+ addi.d i0, i0, 1
+ ld.d t1, TEMP, 0 * SIZE
+ add.d TEMP, TEMP, INCX
+ xvinsgr2vr.d VM0, t1, 0
+ srai.d I, N, 3
+ bge $r0, I, .L21
+ ld.d t2, TEMP, 0 * SIZE
+ add.d TEMP, TEMP, INCX
+ ld.d t3, TEMP, 0 * SIZE
+ add.d TEMP, TEMP, INCX
+ ld.d t4, TEMP, 0 * SIZE
+ add.d TEMP, TEMP, INCX
+ xvinsgr2vr.d VM0, t2, 1
+ xvinsgr2vr.d VM0, t3, 2
+ xvinsgr2vr.d VM0, t4, 3
+ slli.d i0, i0, 2 //4
+ xvreplgr2vr.d VINC4, i0
+ slli.d i0, i0, 1 //8
+ xvreplgr2vr.d VINC8, i0
+ addi.d i0, i0, -15
+ xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+ addi.d i0, i0, 1
+ xvinsgr2vr.d VI1, i0, 1
+ addi.d i0, i0, 1
+ xvinsgr2vr.d VI1, i0, 2
+ addi.d i0, i0, 1
+ xvinsgr2vr.d VI1, i0, 3
+ addi.d i0, i0, 5
+ xvinsgr2vr.d VI0, i0, 0 //1
+ addi.d i0, i0, 1
+ xvinsgr2vr.d VI0, i0, 1 //2
+ addi.d i0, i0, 1
+ xvinsgr2vr.d VI0, i0, 2 //3
+ addi.d i0, i0, 1
+ xvinsgr2vr.d VI0, i0, 3 //4
+ .align 3
+
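+ // .L24: strided main loop. Eight elements are gathered with scalar loads
+ // and inserted into VX0/VX1, then reduced against VM0/VI0 exactly as in the
+ // unit-stride loop above.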
+ .L24:
+ ld.d t1, X, 0 * SIZE
+ add.d X, X, INCX
+ ld.d t2, X, 0 * SIZE
+ add.d X, X, INCX
+ ld.d t3, X, 0 * SIZE
+ add.d X, X, INCX
+ ld.d t4, X, 0 * SIZE
+ add.d X, X, INCX
+ xvinsgr2vr.d VX0, t1, 0
+ xvinsgr2vr.d VX0, t2, 1
+ xvinsgr2vr.d VX0, t3, 2
+ xvinsgr2vr.d VX0, t4, 3
+ xvadd.d VI1, VI1, VINC8
+ ld.d t1, X, 0 * SIZE
+ add.d X, X, INCX
+ ld.d t2, X, 0 * SIZE
+ add.d X, X, INCX
+ ld.d t3, X, 0 * SIZE
+ add.d X, X, INCX
+ ld.d t4, X, 0 * SIZE
+ add.d X, X, INCX
+ xvinsgr2vr.d VX1, t1, 0
+ xvinsgr2vr.d VX1, t2, 1
+ xvinsgr2vr.d VX1, t3, 2
+ xvinsgr2vr.d VX1, t4, 3
+ xvadd.d VI2, VI1, VINC4
+ xvfcmp.clt.d VT0, VX1, VX0
+ addi.d I, I, -1
+ xvbitsel.v VM1, VX0, VX1, VT0
+ xvbitsel.v VI2, VI1, VI2, VT0
+ xvfcmp.clt.d VT0, VM1, VM0
+ xvbitsel.v VM0, VM0, VM1, VT0
+ xvbitsel.v VI0, VI0, VI2, VT0
+ blt $r0, I, .L24
+ .align 3
+
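+ // .L25: lane reduction for the strided path, mirroring .L15.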
+ .L25:
+ xvpickve.d VI1, VI0, 0
+ xvpickve.d VI2, VI0, 1
+ xvpickve.d VI3, VI0, 2
+ xvpickve.d VI4, VI0, 3
+ xvpickve.d x1, VM0, 0
+ xvpickve.d x2, VM0, 1
+ xvpickve.d x3, VM0, 2
+ xvpickve.d x4, VM0, 3
+ xvfcmp.clt.d VT0, x2, x1
+ xvbitsel.v VM1, x1, x2, VT0
+ xvbitsel.v VINC4, VI1, VI2, VT0
+ xvfcmp.clt.d VT0, x4, x3
+ xvbitsel.v VM0, x3, x4, VT0
+ xvbitsel.v VINC8, VI3, VI4, VT0
+ xvfcmp.clt.d VT0, VM1, VM0
+ xvbitsel.v VM0, VM0, VM1, VT0
+ xvbitsel.v VI0, VINC8, VINC4, VT0
+ li.d TEMP, 1 //when values are equal, take the smallest index
+ movgr2fr.d $f17, TEMP
+ ffint.d.l $f17, $f17
+ xvfcmp.ceq.d VT0, VM0, x1
+ fcmp.ceq.d $fcc0, $f23, $f17
+ bceqz $fcc0, .L26
+ xvfcmp.clt.d VT0, VI1, VI0
+ xvbitsel.v VI0, VI0, VI1, VT0
+ .align 3
+
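+ // .L26-.L28: tie-breaking. For every lane whose value equals the final
+ // minimum, keep the smallest index; .L29 then moves the winning index from
+ // the low lane of VI0 into i0.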
+ .L26:
+ xvfcmp.ceq.d VT0, VM0, x2
+ fcmp.ceq.d $fcc0, $f23, $f17
+ bceqz $fcc0, .L27
+ xvfcmp.clt.d VT0, VI2, VI0
+ xvbitsel.v VI0, VI0, VI2, VT0
+ .align 3
+
+ .L27:
+ xvfcmp.ceq.d VT0, VM0, x3
+ fcmp.ceq.d $fcc0, $f23, $f17
+ bceqz $fcc0, .L28
+ xvfcmp.clt.d VT0, VI3, VI0
+ xvbitsel.v VI0, VI0, VI3, VT0
+ .align 3
+
+ .L28:
+ xvfcmp.ceq.d VT0, VM0, x4
+ fcmp.ceq.d $fcc0, $f23, $f17
+ bceqz $fcc0, .L29
+ xvfcmp.clt.d VT0, VI4, VI0
+ xvbitsel.v VI0, VI0, VI4, VT0
+ .align 3
+
+ .L29:
+ movfr2gr.d i0, $f20
+ .align 3
+
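+ // .L21/.L22: scalar tail loop for the remaining N % 8 elements (also used
+ // when N < 8), updating the minimum and its index one element at a time.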
+ .L21: //N<8
+ andi I, N, 7
+ bge $r0, I, .L999
+ srai.d i1, N, 3
+ slli.d i1, i1, 3
+ addi.d i1, i1, 1 //current index
+ movgr2fr.d $f21, i1
+ movgr2fr.d $f20, i0
+ .align 3
+
+ .L22:
+ fld.d $f9, X, 0
+ addi.d I, I, -1
+ fcmp.clt.d $fcc0, $f9, $f15
+ add.d X, X, INCX
+ fsel $f15, $f15, $f9, $fcc0
+ fsel $f20, $f20, $f21, $fcc0
+ addi.d i1, i1, 1
+ movgr2fr.d $f21, i1
+ blt $r0, I, .L22
+ movfr2gr.d i0, $f20
+ .align 3
+
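+ // .L999: return the resulting 1-based index in $r4.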
+ .L999:
+ move $r4, $r17
+ jirl $r0, $r1, 0x0
+ .align 3
+
+ EPILOGUE