Skip to content

Commit e46b48e

Browse files
yanchengyinshiyou
authored and committed
loongarch64: Add optimizations for imax.
1 parent 702fc1d commit e46b48e

File tree

6 files changed

+1151
-0
lines changed

6 files changed

+1151
-0
lines changed

kernel/loongarch64/KERNEL.LOONGSON2K1000

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,4 +19,7 @@ DMAXKERNEL = dmax_lsx.S
1919
SMINKERNEL = smin_lsx.S
2020
DMINKERNEL = dmin_lsx.S
2121

22+
ISMAXKERNEL = ismax_lsx.S
23+
IDMAXKERNEL = idmax_lsx.S
24+
2225
endif

kernel/loongarch64/KERNEL.LOONGSON3R5

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,9 @@ DMAXKERNEL = dmax_lasx.S
1919
SMINKERNEL = smin_lasx.S
2020
DMINKERNEL = dmin_lasx.S
2121

22+
ISMAXKERNEL = ismax_lasx.S
23+
IDMAXKERNEL = idmax_lasx.S
24+
2225
DGEMMKERNEL = dgemm_kernel_16x4.S
2326
DGEMMINCOPY = dgemm_ncopy_16.S
2427
DGEMMITCOPY = dgemm_tcopy_16.S

kernel/loongarch64/idmax_lasx.S

Lines changed: 273 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,273 @@
1+
#define ASSEMBLER

#include "common.h"

/*
 * idmax, LoongArch64 LASX kernel.
 *
 * Returns the 1-based index of the largest element of a double-precision
 * vector. When several elements share the largest value, the smallest
 * index among them is returned.
 *
 * In:    $r4 = N      number of elements
 *        $r5 = X      pointer to the first element
 *        $r6 = INCX   stride between elements, in elements
 * Out:   $r4 = index, or 0 when N <= 0 or INCX <= 0
 * Uses:  $r12-$r19, $xr8-$xr23 (including their $f aliases), $fcc0
 *
 * Layout of the vector paths: each iteration consumes 8 elements as two
 * 4-lane vectors. VM0/VI0 hold the running per-lane maximum and its index;
 * after the loop the four lanes are reduced to lane 0 and ties between
 * lanes are resolved in favour of the smallest index. The N % 8 tail is
 * handled by a scalar loop.
 */

#define N      $r4
#define X      $r5
#define INCX   $r6
#define I      $r12
#define t1     $r13
#define t2     $r15
#define t3     $r18
#define t4     $r16
#define i0     $r17
#define i1     $r14
#define TEMP   $r19
#define x1     $xr9
#define x2     $xr10
#define x3     $xr11
#define x4     $xr12
#define VX0    $xr13
#define VX1    $xr14
#define VM0    $xr15
#define VM1    $xr16
#define VINC4  $xr17
#define VINC8  $xr18
#define VI0    $xr20
#define VI1    $xr21
#define VI2    $xr22
#define VI3    $xr8
#define VI4    $xr19
#define VT0    $xr23

    PROLOGUE
    li.d    i0, 0                       // result for the early-exit cases
    bge     $r0, N, .L999
    bge     $r0, INCX, .L999
    li.d    TEMP, 1
    slli.d  TEMP, TEMP, BASE_SHIFT      // TEMP = byte size of one element
    slli.d  INCX, INCX, BASE_SHIFT      // INCX = stride in bytes
    bne     INCX, TEMP, .L20

    // ---- contiguous input -------------------------------------------
    // Seed the scalar maximum with x[0] only. A 32-byte vector load here
    // would read past the end of the input when N < 4.
    fld.d   $f15, X, 0
    addi.d  i0, i0, 1                   // i0 = 1, index of x[0]
    srai.d  I, N, 3                     // I = number of 8-element groups
    bge     $r0, I, .L21
    xvld    VM0, X, 0                   // N >= 8: per-lane maxima = x[0..3]
    slli.d  i0, i0, 2                   // 4
    xvreplgr2vr.d VINC4, i0
    slli.d  i0, i0, 1                   // 8
    xvreplgr2vr.d VINC8, i0
    // VI1 starts at {-7,-6,-5,-4}; the first add of VINC8 inside the loop
    // turns it into {1,2,3,4}.
    addi.d  i0, i0, -15
    xvinsgr2vr.d VI1, i0, 0
    addi.d  i0, i0, 1
    xvinsgr2vr.d VI1, i0, 1
    addi.d  i0, i0, 1
    xvinsgr2vr.d VI1, i0, 2
    addi.d  i0, i0, 1
    xvinsgr2vr.d VI1, i0, 3
    // VI0 = {1,2,3,4}: indices matching the initial VM0.
    addi.d  i0, i0, 5
    xvinsgr2vr.d VI0, i0, 0             // 1
    addi.d  i0, i0, 1
    xvinsgr2vr.d VI0, i0, 1             // 2
    addi.d  i0, i0, 1
    xvinsgr2vr.d VI0, i0, 2             // 3
    addi.d  i0, i0, 1
    xvinsgr2vr.d VI0, i0, 3             // 4
    .align 3

.L10:
    xvld    VX0, X, 0 * SIZE
    xvadd.d VI1, VI1, VINC8             // indices of VX0
    xvld    VX1, X, 4 * SIZE
    xvadd.d VI2, VI1, VINC4             // indices of VX1
    xvfcmp.clt.d VT0, VX0, VX1
    addi.d  I, I, -1
    xvbitsel.v VM1, VX0, VX1, VT0       // strict compare keeps the lower index on ties
    xvbitsel.v VI2, VI1, VI2, VT0
    xvfcmp.clt.d VT0, VM0, VM1
    addi.d  X, X, 8 * SIZE
    xvbitsel.v VM0, VM0, VM1, VT0
    xvbitsel.v VI0, VI0, VI2, VT0
    blt     $r0, I, .L10
    b       .L25                        // shared cross-lane reduction
    .align 3

    // ---- strided input (INCX != 1) ----------------------------------
.L20:
    move    TEMP, X
    addi.d  i0, i0, 1                   // i0 = 1, index of x[0]
    ld.d    t1, TEMP, 0 * SIZE
    add.d   TEMP, TEMP, INCX
    xvinsgr2vr.d VM0, t1, 0             // lane 0 (also $f15) = x[0]
    srai.d  I, N, 3
    bge     $r0, I, .L21
    ld.d    t2, TEMP, 0 * SIZE
    add.d   TEMP, TEMP, INCX
    ld.d    t3, TEMP, 0 * SIZE
    add.d   TEMP, TEMP, INCX
    ld.d    t4, TEMP, 0 * SIZE
    add.d   TEMP, TEMP, INCX
    xvinsgr2vr.d VM0, t2, 1
    xvinsgr2vr.d VM0, t3, 2
    xvinsgr2vr.d VM0, t4, 3
    slli.d  i0, i0, 2                   // 4
    xvreplgr2vr.d VINC4, i0
    slli.d  i0, i0, 1                   // 8
    xvreplgr2vr.d VINC8, i0
    // Same index setup as the contiguous path.
    addi.d  i0, i0, -15
    xvinsgr2vr.d VI1, i0, 0
    addi.d  i0, i0, 1
    xvinsgr2vr.d VI1, i0, 1
    addi.d  i0, i0, 1
    xvinsgr2vr.d VI1, i0, 2
    addi.d  i0, i0, 1
    xvinsgr2vr.d VI1, i0, 3
    addi.d  i0, i0, 5
    xvinsgr2vr.d VI0, i0, 0             // 1
    addi.d  i0, i0, 1
    xvinsgr2vr.d VI0, i0, 1             // 2
    addi.d  i0, i0, 1
    xvinsgr2vr.d VI0, i0, 2             // 3
    addi.d  i0, i0, 1
    xvinsgr2vr.d VI0, i0, 3             // 4
    .align 3

.L24:
    ld.d    t1, X, 0 * SIZE
    add.d   X, X, INCX
    ld.d    t2, X, 0 * SIZE
    add.d   X, X, INCX
    ld.d    t3, X, 0 * SIZE
    add.d   X, X, INCX
    ld.d    t4, X, 0 * SIZE
    add.d   X, X, INCX
    xvinsgr2vr.d VX0, t1, 0
    xvinsgr2vr.d VX0, t2, 1
    xvinsgr2vr.d VX0, t3, 2
    xvinsgr2vr.d VX0, t4, 3
    xvadd.d VI1, VI1, VINC8             // indices of VX0
    ld.d    t1, X, 0 * SIZE
    add.d   X, X, INCX
    ld.d    t2, X, 0 * SIZE
    add.d   X, X, INCX
    ld.d    t3, X, 0 * SIZE
    add.d   X, X, INCX
    ld.d    t4, X, 0 * SIZE
    add.d   X, X, INCX
    xvinsgr2vr.d VX1, t1, 0
    xvinsgr2vr.d VX1, t2, 1
    xvinsgr2vr.d VX1, t3, 2
    xvinsgr2vr.d VX1, t4, 3
    xvadd.d VI2, VI1, VINC4             // indices of VX1
    xvfcmp.clt.d VT0, VX0, VX1
    addi.d  I, I, -1
    xvbitsel.v VM1, VX0, VX1, VT0
    xvbitsel.v VI2, VI1, VI2, VT0
    xvfcmp.clt.d VT0, VM0, VM1
    xvbitsel.v VM0, VM0, VM1, VT0
    xvbitsel.v VI0, VI0, VI2, VT0
    blt     $r0, I, .L24
    .align 3

    // ---- reduce the four lanes of VM0/VI0 into lane 0 ---------------
.L25:
    xvpickve.d VI1, VI0, 0
    xvpickve.d VI2, VI0, 1
    xvpickve.d VI3, VI0, 2
    xvpickve.d VI4, VI0, 3
    xvpickve.d x1, VM0, 0
    xvpickve.d x2, VM0, 1
    xvpickve.d x3, VM0, 2
    xvpickve.d x4, VM0, 3
    xvfcmp.clt.d VT0, x1, x2
    xvbitsel.v VM1, x1, x2, VT0
    xvbitsel.v VINC4, VI1, VI2, VT0     // VINC4/VINC8 are free after the loop
    xvfcmp.clt.d VT0, x3, x4
    xvbitsel.v VM0, x3, x4, VT0
    xvbitsel.v VINC8, VI3, VI4, VT0
    xvfcmp.clt.d VT0, VM0, VM1
    xvbitsel.v VM0, VM0, VM1, VT0
    xvbitsel.v VI0, VINC8, VINC4, VT0

    // Tie-break: the reduction above may have picked a lane whose index
    // is larger than that of another lane holding the same value. For
    // every lane whose maximum equals the overall maximum ($f15), keep
    // the smaller of its index and the current best index.
    // The comparison is done on the values themselves; comparing the
    // all-ones/all-zeros mask of xvfcmp.ceq.d against 1.0 is never true.
    fcmp.ceq.d $fcc0, $f15, $f9         // max == lane 0 value ?
    bceqz   $fcc0, .L26
    xvfcmp.clt.d VT0, VI1, VI0
    xvbitsel.v VI0, VI0, VI1, VT0
    .align 3

.L26:
    fcmp.ceq.d $fcc0, $f15, $f10        // max == lane 1 value ?
    bceqz   $fcc0, .L27
    xvfcmp.clt.d VT0, VI2, VI0
    xvbitsel.v VI0, VI0, VI2, VT0
    .align 3

.L27:
    fcmp.ceq.d $fcc0, $f15, $f11        // max == lane 2 value ?
    bceqz   $fcc0, .L28
    xvfcmp.clt.d VT0, VI3, VI0
    xvbitsel.v VI0, VI0, VI3, VT0
    .align 3

.L28:
    fcmp.ceq.d $fcc0, $f15, $f12        // max == lane 3 value ?
    bceqz   $fcc0, .L29
    xvfcmp.clt.d VT0, VI4, VI0
    xvbitsel.v VI0, VI0, VI4, VT0
    .align 3

.L29:
    movfr2gr.d i0, $f20                 // i0 = best index so far (lane 0 of VI0)
    .align 3

    // ---- scalar tail: the remaining N % 8 elements ------------------
.L21:
    andi    I, N, 7
    bge     $r0, I, .L999
    srai.d  i1, N, 3
    slli.d  i1, i1, 3
    addi.d  i1, i1, 1                   // index of the first tail element
    movgr2fr.d $f21, i1
    movgr2fr.d $f20, i0
    .align 3

.L22:
    fld.d   $f9, X, 0
    addi.d  I, I, -1
    fcmp.clt.d $fcc0, $f15, $f9         // strict: earlier index wins ties
    add.d   X, X, INCX
    fsel    $f15, $f15, $f9, $fcc0
    fsel    $f20, $f20, $f21, $fcc0
    addi.d  i1, i1, 1
    movgr2fr.d $f21, i1
    blt     $r0, I, .L22
    movfr2gr.d i0, $f20
    .align 3

.L999:
    move    $r4, $r17                   // return i0
    jirl    $r0, $r1, 0x0
    .align 3

    EPILOGUE

0 commit comments

Comments
 (0)