
Commit e3fb2b5

yanchengyinshiyou authored and committed
loongarch64: Add optimizations for imin.
1 parent e46b48e commit e3fb2b5
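
For reference, the ISMIN/IDMIN kernels added here return the (1-based) position of the smallest element of a vector, keep the first occurrence when several elements are equal, and return 0 when N or INCX is not positive. A minimal scalar sketch of that contract, assuming OpenBLAS-style BLASLONG/FLOAT types; the function name idmin_ref is illustrative, not part of this commit:

typedef long BLASLONG;
typedef double FLOAT;

/* Scalar reference for IDMIN: index of the minimum value of x (stride inc_x). */
static BLASLONG idmin_ref(BLASLONG n, const FLOAT *x, BLASLONG inc_x)
{
    if (n <= 0 || inc_x <= 0) return 0;   /* same early exit as the assembly  */

    BLASLONG best = 1;                    /* 1-based index of the current min */
    FLOAT    minv = x[0];

    for (BLASLONG i = 1; i < n; i++) {
        FLOAT v = x[i * inc_x];
        if (v < minv) {                   /* strict '<' keeps the first hit   */
            minv = v;
            best = i + 1;
        }
    }
    return best;
}

The idmin_lasx.S file shown below vectorizes this loop with 256-bit LASX registers for the LOONGSON3R5 target; the ismin_lsx.S/idmin_lsx.S kernels referenced by KERNEL.LOONGSON2K1000 are the 128-bit LSX counterparts.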

6 files changed: 1148 additions & 0 deletions


kernel/loongarch64/KERNEL.LOONGSON2K1000

Lines changed: 3 additions & 0 deletions
@@ -22,4 +22,7 @@ DMINKERNEL = dmin_lsx.S
 ISMAXKERNEL = ismax_lsx.S
 IDMAXKERNEL = idmax_lsx.S
 
+ISMINKERNEL = ismin_lsx.S
+IDMINKERNEL = idmin_lsx.S
+
 endif

kernel/loongarch64/KERNEL.LOONGSON3R5

Lines changed: 3 additions & 0 deletions
@@ -22,6 +22,9 @@ DMINKERNEL = dmin_lasx.S
 ISMAXKERNEL = ismax_lasx.S
 IDMAXKERNEL = idmax_lasx.S
 
+ISMINKERNEL = ismin_lasx.S
+IDMINKERNEL = idmin_lasx.S
+
 DGEMMKERNEL = dgemm_kernel_16x4.S
 DGEMMINCOPY = dgemm_ncopy_16.S
 DGEMMITCOPY = dgemm_tcopy_16.S

kernel/loongarch64/idmin_lasx.S

Lines changed: 272 additions & 0 deletions
@@ -0,0 +1,272 @@
#define ASSEMBLER

#include "common.h"

#define N $r4
#define X $r5
#define INCX $r6
#define I $r12
#define t1 $r13
#define t2 $r15
#define t3 $r18
#define t4 $r16
#define i0 $r17
#define i1 $r14
#define TEMP $r19
#define x1 $xr9
#define x2 $xr10
#define x3 $xr11
#define x4 $xr12
#define VX0 $xr13
#define VX1 $xr14
#define VM0 $xr15
#define VM1 $xr16
#define VINC4 $xr17
#define VINC8 $xr18
#define VI0 $xr20
#define VI1 $xr21
#define VI2 $xr22
#define VI3 $xr8
#define VI4 $xr19
#define VT0 $xr23

PROLOGUE
    li.d i0, 0
    bge $r0, N, .L999
    bge $r0, INCX, .L999
    li.d TEMP, 1
    slli.d TEMP, TEMP, BASE_SHIFT
    slli.d INCX, INCX, BASE_SHIFT
    bne INCX, TEMP, .L20
    xvld VM0, X, 0 // seed the running minima with the first 4 elements
    addi.d i0, i0, 1
    srai.d I, N, 3
    bge $r0, I, .L21
    slli.d i0, i0, 2 //4
    xvreplgr2vr.d VINC4, i0
    slli.d i0, i0, 1 //8
    xvreplgr2vr.d VINC8, i0
    addi.d i0, i0, -15
    xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
    addi.d i0, i0, 1
    xvinsgr2vr.d VI1, i0, 1
    addi.d i0, i0, 1
    xvinsgr2vr.d VI1, i0, 2
    addi.d i0, i0, 1
    xvinsgr2vr.d VI1, i0, 3
    addi.d i0, i0, 5
    xvinsgr2vr.d VI0, i0, 0 //1
    addi.d i0, i0, 1
    xvinsgr2vr.d VI0, i0, 1 //2
    addi.d i0, i0, 1
    xvinsgr2vr.d VI0, i0, 2 //3
    addi.d i0, i0, 1
    xvinsgr2vr.d VI0, i0, 3 //4
    .align 3

.L10: // INCX==1 main loop: 8 doubles per iteration
    xvld VX0, X, 0 * SIZE
    xvadd.d VI1, VI1, VINC8
    xvld VX1, X, 4 * SIZE
    xvadd.d VI2, VI1, VINC4
    xvfcmp.clt.d VT0, VX1, VX0
    addi.d I, I, -1
    xvbitsel.v VM1, VX0, VX1, VT0
    xvbitsel.v VI2, VI1, VI2, VT0
    xvfcmp.clt.d VT0, VM1, VM0
    addi.d X, X, 8 * SIZE
    xvbitsel.v VM0, VM0, VM1, VT0
    xvbitsel.v VI0, VI0, VI2, VT0
    blt $r0, I, .L10
    .align 3

.L15: // reduce the 4 lanes of VM0/VI0 to a single minimum and index
    xvpickve.d VI1, VI0, 0
    xvpickve.d VI2, VI0, 1
    xvpickve.d VI3, VI0, 2
    xvpickve.d VI4, VI0, 3
    xvpickve.d x1, VM0, 0
    xvpickve.d x2, VM0, 1
    xvpickve.d x3, VM0, 2
    xvpickve.d x4, VM0, 3
    xvfcmp.clt.d VT0, x2, x1
    xvbitsel.v VM1, x1, x2, VT0
    xvbitsel.v VINC4, VI1, VI2, VT0
    xvfcmp.clt.d VT0, x4, x3
    xvbitsel.v VM0, x3, x4, VT0
    xvbitsel.v VINC8, VI3, VI4, VT0
    xvfcmp.clt.d VT0, VM1, VM0
    xvbitsel.v VM0, VM0, VM1, VT0
    xvbitsel.v VI0, VINC8, VINC4, VT0
    li.d TEMP, 1 // on equal values, keep the smallest index
    movgr2fr.d $f17, TEMP
    ffint.d.l $f17, $f17
    xvfcmp.ceq.d VT0, VM0, x1
    fcmp.ceq.d $fcc0, $f23, $f17
    bceqz $fcc0, .L26
    xvfcmp.clt.d VT0, VI1, VI0
    xvbitsel.v VI0, VI0, VI1, VT0
    b .L26
    .align 3

.L20: // INCX!=1
    move TEMP, X
    addi.d i0, i0, 1
    ld.d t1, TEMP, 0 * SIZE
    add.d TEMP, TEMP, INCX
    xvinsgr2vr.d VM0, t1, 0
    srai.d I, N, 3
    bge $r0, I, .L21
    ld.d t2, TEMP, 0 * SIZE
    add.d TEMP, TEMP, INCX
    ld.d t3, TEMP, 0 * SIZE
    add.d TEMP, TEMP, INCX
    ld.d t4, TEMP, 0 * SIZE
    add.d TEMP, TEMP, INCX
    xvinsgr2vr.d VM0, t2, 1
    xvinsgr2vr.d VM0, t3, 2
    xvinsgr2vr.d VM0, t4, 3
    slli.d i0, i0, 2 //4
    xvreplgr2vr.d VINC4, i0
    slli.d i0, i0, 1 //8
    xvreplgr2vr.d VINC8, i0
    addi.d i0, i0, -15
    xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
    addi.d i0, i0, 1
    xvinsgr2vr.d VI1, i0, 1
    addi.d i0, i0, 1
    xvinsgr2vr.d VI1, i0, 2
    addi.d i0, i0, 1
    xvinsgr2vr.d VI1, i0, 3
    addi.d i0, i0, 5
    xvinsgr2vr.d VI0, i0, 0 //1
    addi.d i0, i0, 1
    xvinsgr2vr.d VI0, i0, 1 //2
    addi.d i0, i0, 1
    xvinsgr2vr.d VI0, i0, 2 //3
    addi.d i0, i0, 1
    xvinsgr2vr.d VI0, i0, 3 //4
    .align 3

.L24: // INCX!=1 main loop: gather 8 strided doubles per iteration
    ld.d t1, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t2, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t3, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t4, X, 0 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.d VX0, t1, 0
    xvinsgr2vr.d VX0, t2, 1
    xvinsgr2vr.d VX0, t3, 2
    xvinsgr2vr.d VX0, t4, 3
    xvadd.d VI1, VI1, VINC8
    ld.d t1, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t2, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t3, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t4, X, 0 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.d VX1, t1, 0
    xvinsgr2vr.d VX1, t2, 1
    xvinsgr2vr.d VX1, t3, 2
    xvinsgr2vr.d VX1, t4, 3
    xvadd.d VI2, VI1, VINC4
    xvfcmp.clt.d VT0, VX1, VX0
    addi.d I, I, -1
    xvbitsel.v VM1, VX0, VX1, VT0
    xvbitsel.v VI2, VI1, VI2, VT0
    xvfcmp.clt.d VT0, VM1, VM0
    xvbitsel.v VM0, VM0, VM1, VT0
    xvbitsel.v VI0, VI0, VI2, VT0
    blt $r0, I, .L24
    .align 3

.L25: // lane reduction for the strided path
    xvpickve.d VI1, VI0, 0
    xvpickve.d VI2, VI0, 1
    xvpickve.d VI3, VI0, 2
    xvpickve.d VI4, VI0, 3
    xvpickve.d x1, VM0, 0
    xvpickve.d x2, VM0, 1
    xvpickve.d x3, VM0, 2
    xvpickve.d x4, VM0, 3
    xvfcmp.clt.d VT0, x2, x1
    xvbitsel.v VM1, x1, x2, VT0
    xvbitsel.v VINC4, VI1, VI2, VT0
    xvfcmp.clt.d VT0, x4, x3
    xvbitsel.v VM0, x3, x4, VT0
    xvbitsel.v VINC8, VI3, VI4, VT0
    xvfcmp.clt.d VT0, VM1, VM0
    xvbitsel.v VM0, VM0, VM1, VT0
    xvbitsel.v VI0, VINC8, VINC4, VT0
    li.d TEMP, 1 // on equal values, keep the smallest index
    movgr2fr.d $f17, TEMP
    ffint.d.l $f17, $f17
    xvfcmp.ceq.d VT0, VM0, x1
    fcmp.ceq.d $fcc0, $f23, $f17
    bceqz $fcc0, .L26
    xvfcmp.clt.d VT0, VI1, VI0
    xvbitsel.v VI0, VI0, VI1, VT0
    .align 3

.L26: // tie-break chain: among equal minima keep the smallest index
    xvfcmp.ceq.d VT0, VM0, x2
    fcmp.ceq.d $fcc0, $f23, $f17
    bceqz $fcc0, .L27
    xvfcmp.clt.d VT0, VI2, VI0
    xvbitsel.v VI0, VI0, VI2, VT0
    .align 3

.L27:
    xvfcmp.ceq.d VT0, VM0, x3
    fcmp.ceq.d $fcc0, $f23, $f17
    bceqz $fcc0, .L28
    xvfcmp.clt.d VT0, VI3, VI0
    xvbitsel.v VI0, VI0, VI3, VT0
    .align 3

.L28:
    xvfcmp.ceq.d VT0, VM0, x4
    fcmp.ceq.d $fcc0, $f23, $f17
    bceqz $fcc0, .L29
    xvfcmp.clt.d VT0, VI4, VI0
    xvbitsel.v VI0, VI0, VI4, VT0
    .align 3

.L29:
    movfr2gr.d i0, $f20 // winning index: low lane of VI0
    .align 3

.L21: // scalar tail: fewer than 8 elements remain
    andi I, N, 7
    bge $r0, I, .L999
    srai.d i1, N, 3
    slli.d i1, i1, 3
    addi.d i1, i1, 1 //current index
    movgr2fr.d $f21, i1
    movgr2fr.d $f20, i0
    .align 3

.L22:
    fld.d $f9, X, 0
    addi.d I, I, -1
    fcmp.clt.d $fcc0, $f9, $f15
    add.d X, X, INCX
    fsel $f15, $f15, $f9, $fcc0
    fsel $f20, $f20, $f21, $fcc0
    addi.d i1, i1, 1
    movgr2fr.d $f21, i1
    blt $r0, I, .L22
    movfr2gr.d i0, $f20
    .align 3

.L999:
    move $r4, $r17 // return the index in $r4
    jirl $r0, $r1, 0x0
    .align 3

EPILOGUE
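
Structurally, the kernel above keeps four running minima and their 1-based indices in 256-bit $xr registers (VM0/VI0), reduces across lanes only after the main loop, and then walks the .L26-.L29 chain so that equal minima resolve to the smallest index. A rough C rendering of that strategy, a sketch only: it is unrolled four wide rather than the kernel's eight, and the names idmin_lasx_sketch, lane_min and lane_idx are illustrative:

/* Per-lane running minima with a late horizontal reduction, mirroring the
   shape of idmin_lasx.S for the contiguous (INCX == 1) case. */
static long idmin_lasx_sketch(long n, const double *x)
{
    if (n <= 0) return 0;

    double minv = x[0];
    long   best = 1;

    if (n >= 4) {
        double lane_min[4];
        long   lane_idx[4];

        /* Seed each lane from the first four elements (the kernel seeds VM0 with xvld). */
        for (int k = 0; k < 4; k++) { lane_min[k] = x[k]; lane_idx[k] = k + 1; }

        long i = 4;
        for (; i + 4 <= n; i += 4) {          /* kernel: 8 elements per trip        */
            for (int k = 0; k < 4; k++) {     /* xvfcmp.clt.d + xvbitsel.v per lane */
                if (x[i + k] < lane_min[k]) {
                    lane_min[k] = x[i + k];
                    lane_idx[k] = i + k + 1;
                }
            }
        }

        /* Horizontal reduction; ties go to the smallest index (.L15 and .L26-.L29). */
        minv = lane_min[0];
        best = lane_idx[0];
        for (int k = 1; k < 4; k++) {
            if (lane_min[k] < minv ||
                (lane_min[k] == minv && lane_idx[k] < best)) {
                minv = lane_min[k];
                best = lane_idx[k];
            }
        }

        /* Scalar tail, as in .L21/.L22. */
        for (; i < n; i++) {
            if (x[i] < minv) { minv = x[i]; best = i + 1; }
        }
        return best;
    }

    /* Fewer than four elements: plain scalar scan. */
    for (long i = 1; i < n; i++) {
        if (x[i] < minv) { minv = x[i]; best = i + 1; }
    }
    return best;
}

The strided path (.L24) follows the same pattern but gathers the eight elements with scalar ld.d before the vector compare; both paths share the .L26-.L29 tie-break chain and the .L22 scalar tail.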
