Skip to content

Commit 49829b2

Browse files
yanchengyinshiyou
authored and committed
loongarch64: Add optimizations for iamin.
1 parent be83f5e commit 49829b2

File tree

6 files changed

+1162
-0
lines changed

6 files changed

+1162
-0
lines changed

kernel/loongarch64/KERNEL.LOONGSON2K1000

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,4 +28,7 @@ IDMINKERNEL = idmin_lsx.S
2828
ISAMAXKERNEL = isamax_lsx.S
2929
IDAMAXKERNEL = idamax_lsx.S
3030

31+
ISAMINKERNEL = isamin_lsx.S
32+
IDAMINKERNEL = idamin_lsx.S
33+
3134
endif

kernel/loongarch64/KERNEL.LOONGSON3R5

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,9 @@ IDMINKERNEL = idmin_lasx.S
2828
ISAMAXKERNEL = isamax_lasx.S
2929
IDAMAXKERNEL = idamax_lasx.S
3030

31+
ISAMINKERNEL = isamin_lasx.S
32+
IDAMINKERNEL = idamin_lasx.S
33+
3134
DGEMMKERNEL = dgemm_kernel_16x4.S
3235
DGEMMINCOPY = dgemm_ncopy_16.S
3336
DGEMMITCOPY = dgemm_tcopy_16.S

kernel/loongarch64/idamin_lasx.S

Lines changed: 275 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,275 @@
1+
#define ASSEMBLER
2+
3+
#include "common.h"
4+
5+
// ---- Integer registers -------------------------------------------------
// N: element count. Checked against zero on entry, then split into
// N >> 3 vector iterations and N & 7 scalar tail elements.
#define N $r4
6+
// X: address of the current element; advanced as elements are consumed.
#define X $r5
7+
// INCX: stride. Shifted left by BASE_SHIFT on entry and afterwards added
// to X once per element.
#define INCX $r6
8+
// I: loop counter for both the vector loops and the scalar tail loop.
#define I $r12
9+
// t1-t4: values fetched with ld.d in the strided path before they are
// inserted into vector lanes.
#define t1 $r13
10+
#define t2 $r15
11+
#define t3 $r18
12+
#define t4 $r16
13+
// i0: result index; it is copied to $r4 at the exit label. Also used as a
// scratch counter while the index vectors are being initialised.
#define i0 $r17
14+
// i1: running index of the element handled by the scalar tail loop.
#define i1 $r14
15+
// TEMP: scratch (unit-stride comparison constant, look-ahead pointer in
// the strided path).
#define TEMP $r19
16+
// ---- LASX vector registers ---------------------------------------------
// x1-x4: lanes 0-3 of VM0, extracted for the final reduction. The tail
// loop also loads each scalar into $f9 and then reads it as x1.
#define x1 $xr9
17+
#define x2 $xr10
18+
#define x3 $xr11
19+
#define x4 $xr12
20+
// VX0/VX1: the two groups of four doubles processed per vector iteration.
#define VX0 $xr13
21+
#define VX1 $xr14
22+
// VM0: running per-lane result of xvfmina.d (minimum by magnitude).
// VM1: scratch for the candidate minimum of the current step.
#define VM0 $xr15
23+
#define VM1 $xr16
24+
// VINC4/VINC8: the values 4 and 8 replicated to all lanes, used to step
// the index vectors. Both are overwritten as scratch in the reduction.
#define VINC4 $xr17
25+
#define VINC8 $xr18
26+
// VI0: indices that belong to the values held in VM0.
// VI1/VI2: indices of the lanes of VX0/VX1 inside the vector loops; in the
// reduction they hold lanes 0 and 1 of VI0.
// VI3/VI4: lanes 2 and 3 of VI0 in the reduction.
#define VI0 $xr20
27+
#define VI1 $xr21
28+
#define VI2 $xr22
29+
#define VI3 $xr8
30+
#define VI4 $xr19
31+
// VT0: lane mask produced by the xvfcmp instructions, consumed by
// xvbitsel.v.
#define VT0 $xr23
32+
33+
// ---------------------------------------------------------------------------
// idamin kernel for LoongArch64 LASX (the symbol itself comes from PROLOGUE).
//
// In:    N    ($r4) = element count
//        X    ($r5) = address of the first double
//        INCX ($r6) = stride, scaled to a byte offset with BASE_SHIFT
// Out:   $r4 = 1-based index of the element with the smallest magnitude,
//        lowest index on exact ties; 0 when N <= 0 or INCX <= 0.
// Uses:  $r12-$r19, $xr8-$xr23, $fcc0. No stack use.
//
// Scalar views of vector registers used below (low 64 bits of lane 0):
//        $f9..$f12 = x1..x4, $f15 = VM0, $f20 = VI0, $f21 = VI1.
//
// Layout: eight elements per iteration (.L10 unit stride, .L24 strided),
// horizontal reduction of the four lanes (.L15 / .L25 -> .L26..L29), then a
// scalar loop for the remaining N & 7 elements (.L21 / .L22).
//
// NOTE(review): every equality test compares signed values, so two elements
// with equal magnitude and opposite sign are not recognised as a tie.
// Confirm whether the first-index rule must also hold for that case.
// ---------------------------------------------------------------------------
    PROLOGUE
    li.d            i0, 0
    bge             $r0, N, .L999           // N <= 0: return 0
    bge             $r0, INCX, .L999        // INCX <= 0: return 0
    li.d            TEMP, 1
    slli.d          TEMP, TEMP, BASE_SHIFT
    slli.d          INCX, INCX, BASE_SHIFT
    bne             INCX, TEMP, .L20        // not contiguous: strided path
    addi.d          i0, i0, 1               // default answer: index 1
    srai.d          I, N, 3                 // number of 8-element iterations
    // Only the first element is loaded before N is known to be >= 8. A full
    // 32-byte vector load here would read past the array when N < 4, and the
    // scalar tail only ever looks at lane 0 of VM0.
    fld.d           $f15, X, 0
    bge             $r0, I, .L21
    xvld            VM0, X, 0               // seed the minimum with X[0..3]
    slli.d          i0, i0, 2 //4
    xvreplgr2vr.d   VINC4, i0
    slli.d          i0, i0, 1 //8
    xvreplgr2vr.d   VINC8, i0
    // VI1 starts 8 below the first group so that the add at the top of the
    // loop yields 1..4 on the first iteration.
    addi.d          i0, i0, -15
    xvinsgr2vr.d    VI1, i0, 0 //initialize the index value for vectorization
    addi.d          i0, i0, 1
    xvinsgr2vr.d    VI1, i0, 1
    addi.d          i0, i0, 1
    xvinsgr2vr.d    VI1, i0, 2
    addi.d          i0, i0, 1
    xvinsgr2vr.d    VI1, i0, 3
    // VI0 = 1..4, the indices of the seed values in VM0.
    addi.d          i0, i0, 5
    xvinsgr2vr.d    VI0, i0, 0 //1
    addi.d          i0, i0, 1
    xvinsgr2vr.d    VI0, i0, 1 //2
    addi.d          i0, i0, 1
    xvinsgr2vr.d    VI0, i0, 2 //3
    addi.d          i0, i0, 1
    xvinsgr2vr.d    VI0, i0, 3 //4
    .align 3

.L10: // unit stride, 8 elements per iteration
    xvld            VX0, X, 0 * SIZE
    xvadd.d         VI1, VI1, VINC8         // indices of the VX0 lanes
    xvld            VX1, X, 4 * SIZE
    xvadd.d         VI2, VI1, VINC4         // indices of the VX1 lanes
    xvfmina.d       VM1, VX0, VX1
    xvfcmp.ceq.d    VT0, VX0, VM1
    addi.d          I, I, -1
    xvbitsel.v      VI2, VI2, VI1, VT0      // index of the winner, VX0 wins ties
    xvfmina.d       VM1, VM0, VM1
    xvfcmp.ceq.d    VT0, VM0, VM1
    addi.d          X, X, 8 * SIZE
    xvbitsel.v      VM0, VM1, VM0, VT0      // keep the old minimum on ties
    xvbitsel.v      VI0, VI2, VI0, VT0
    blt             $r0, I, .L10
    .align 3

.L15: // reduce the four lanes of VM0 / VI0 (unit-stride path)
    xvpickve.d      VI1, VI0, 0
    xvpickve.d      VI2, VI0, 1
    xvpickve.d      VI3, VI0, 2
    xvpickve.d      VI4, VI0, 3
    xvpickve.d      x1, VM0, 0
    xvpickve.d      x2, VM0, 1
    xvpickve.d      x3, VM0, 2
    xvpickve.d      x4, VM0, 3
    xvfmina.d       VM1, x1, x2
    xvfcmp.ceq.d    VT0, x1, VM1
    xvbitsel.v      VINC4, VI2, VI1, VT0    // VINC4 reused: winner of lanes 0/1
    xvfmina.d       VM0, x4, x3
    xvfcmp.ceq.d    VT0, x3, VM0
    xvbitsel.v      VINC8, VI4, VI3, VT0    // VINC8 reused: winner of lanes 2/3
    xvfmina.d       VM0, VM0, VM1
    xvfcmp.ceq.d    VT0, VM0, VM1
    xvbitsel.v      VI0, VINC8, VINC4, VT0
    // When lanes tie on the minimum, take the smallest index. The values are
    // compared directly: an xvfcmp mask is all-ones or zero and never
    // compares equal to a number, so testing the mask would skip this step.
    fcmp.ceq.d      $fcc0, $f15, $f9        // overall minimum == lane 0 ?
    bceqz           $fcc0, .L26
    xvfcmp.clt.d    VT0, VI1, VI0
    xvbitsel.v      VI0, VI0, VI1, VT0
    b               .L26
    .align 3

.L20: // INCX!=1
    move            TEMP, X                 // look-ahead pointer, X stays put
    addi.d          i0, i0, 1               // default answer: index 1
    ld.d            t1, TEMP, 0 * SIZE
    add.d           TEMP, TEMP, INCX
    xvinsgr2vr.d    VM0, t1, 0
    srai.d          I, N, 3
    bge             $r0, I, .L21            // N < 8: only lane 0 is needed
    ld.d            t2, TEMP, 0 * SIZE
    add.d           TEMP, TEMP, INCX
    ld.d            t3, TEMP, 0 * SIZE
    add.d           TEMP, TEMP, INCX
    ld.d            t4, TEMP, 0 * SIZE
    add.d           TEMP, TEMP, INCX
    xvinsgr2vr.d    VM0, t2, 1
    xvinsgr2vr.d    VM0, t3, 2
    xvinsgr2vr.d    VM0, t4, 3
    slli.d          i0, i0, 2 //4
    xvreplgr2vr.d   VINC4, i0
    slli.d          i0, i0, 1 //8
    xvreplgr2vr.d   VINC8, i0
    // Same index setup as the unit-stride path: VI1 = -7..-4, VI0 = 1..4.
    addi.d          i0, i0, -15
    xvinsgr2vr.d    VI1, i0, 0 //initialize the index value for vectorization
    addi.d          i0, i0, 1
    xvinsgr2vr.d    VI1, i0, 1
    addi.d          i0, i0, 1
    xvinsgr2vr.d    VI1, i0, 2
    addi.d          i0, i0, 1
    xvinsgr2vr.d    VI1, i0, 3
    addi.d          i0, i0, 5
    xvinsgr2vr.d    VI0, i0, 0 //1
    addi.d          i0, i0, 1
    xvinsgr2vr.d    VI0, i0, 1 //2
    addi.d          i0, i0, 1
    xvinsgr2vr.d    VI0, i0, 2 //3
    addi.d          i0, i0, 1
    xvinsgr2vr.d    VI0, i0, 3 //4
    .align 3

.L24: // strided, 8 elements per iteration, gathered through t1-t4
    ld.d            t1, X, 0 * SIZE
    add.d           X, X, INCX
    ld.d            t2, X, 0 * SIZE
    add.d           X, X, INCX
    ld.d            t3, X, 0 * SIZE
    add.d           X, X, INCX
    ld.d            t4, X, 0 * SIZE
    add.d           X, X, INCX
    xvinsgr2vr.d    VX0, t1, 0
    xvinsgr2vr.d    VX0, t2, 1
    xvinsgr2vr.d    VX0, t3, 2
    xvinsgr2vr.d    VX0, t4, 3
    xvadd.d         VI1, VI1, VINC8         // indices of the VX0 lanes
    ld.d            t1, X, 0 * SIZE
    add.d           X, X, INCX
    ld.d            t2, X, 0 * SIZE
    add.d           X, X, INCX
    ld.d            t3, X, 0 * SIZE
    add.d           X, X, INCX
    ld.d            t4, X, 0 * SIZE
    add.d           X, X, INCX
    xvinsgr2vr.d    VX1, t1, 0
    xvinsgr2vr.d    VX1, t2, 1
    xvinsgr2vr.d    VX1, t3, 2
    xvinsgr2vr.d    VX1, t4, 3
    xvadd.d         VI2, VI1, VINC4         // indices of the VX1 lanes
    xvfmina.d       VM1, VX0, VX1
    xvfcmp.ceq.d    VT0, VX0, VM1
    xvbitsel.v      VI2, VI2, VI1, VT0      // index of the winner, VX0 wins ties
    xvfmina.d       VM1, VM0, VM1
    xvfcmp.ceq.d    VT0, VM0, VM1
    addi.d          I, I, -1
    xvbitsel.v      VM0, VM1, VM0, VT0      // keep the old minimum on ties
    xvbitsel.v      VI0, VI2, VI0, VT0
    blt             $r0, I, .L24
    .align 3

.L25: // reduce the four lanes of VM0 / VI0 (strided path)
    xvpickve.d      VI1, VI0, 0
    xvpickve.d      VI2, VI0, 1
    xvpickve.d      VI3, VI0, 2
    xvpickve.d      VI4, VI0, 3
    xvpickve.d      x1, VM0, 0
    xvpickve.d      x2, VM0, 1
    xvpickve.d      x3, VM0, 2
    xvpickve.d      x4, VM0, 3
    xvfmina.d       VM1, x1, x2
    xvfcmp.ceq.d    VT0, x1, VM1
    xvbitsel.v      VINC4, VI2, VI1, VT0    // VINC4 reused: winner of lanes 0/1
    xvfmina.d       VM0, x4, x3
    xvfcmp.ceq.d    VT0, x3, VM0
    xvbitsel.v      VINC8, VI4, VI3, VT0    // VINC8 reused: winner of lanes 2/3
    xvfmina.d       VM0, VM0, VM1
    xvfcmp.ceq.d    VT0, VM0, VM1
    xvbitsel.v      VI0, VINC8, VINC4, VT0
    // Tie handling, see .L15: compare the values, not the xvfcmp mask.
    fcmp.ceq.d      $fcc0, $f15, $f9        // overall minimum == lane 0 ?
    bceqz           $fcc0, .L26
    xvfcmp.clt.d    VT0, VI1, VI0
    xvbitsel.v      VI0, VI0, VI1, VT0
    .align 3

.L26: // lane 1 ties with the minimum: prefer its index when it is smaller
    fcmp.ceq.d      $fcc0, $f15, $f10
    bceqz           $fcc0, .L27
    xvfcmp.clt.d    VT0, VI2, VI0
    xvbitsel.v      VI0, VI0, VI2, VT0
    .align 3

.L27: // same check for lane 2
    fcmp.ceq.d      $fcc0, $f15, $f11
    bceqz           $fcc0, .L28
    xvfcmp.clt.d    VT0, VI3, VI0
    xvbitsel.v      VI0, VI0, VI3, VT0
    .align 3

.L28: // same check for lane 3
    fcmp.ceq.d      $fcc0, $f15, $f12
    bceqz           $fcc0, .L29
    xvfcmp.clt.d    VT0, VI4, VI0
    xvbitsel.v      VI0, VI0, VI4, VT0
    .align 3

.L29:
    movfr2gr.d      i0, $f20                // result so far = lane 0 of VI0
    .align 3

.L21: // scalar tail: the remaining N & 7 elements (all of them when N < 8)
    andi            I, N, 7
    bge             $r0, I, .L999
    srai.d          i1, N, 3
    slli.d          i1, i1, 3
    addi.d          i1, i1, 1 //current index
    movgr2fr.d      $f21, i1                // lane 0 of VI1 = candidate index
    movgr2fr.d      $f20, i0                // lane 0 of VI0 = best index so far
    .align 3

.L22:
    fld.d           $f9, X, 0               // lane 0 of x1 = current element
    addi.d          I, I, -1
    xvfmina.d       VM1, x1, VM0
    xvfcmp.ceq.d    VT0, VM0, VM1
    add.d           X, X, INCX
    xvbitsel.v      VM0, VM1, VM0, VT0      // keep the old minimum on ties
    xvbitsel.v      VI0, VI1, VI0, VT0
    addi.d          i1, i1, 1
    movgr2fr.d      $f21, i1
    blt             $r0, I, .L22
    movfr2gr.d      i0, $f20
    .align 3

.L999:
    move            $r4, $r17               // return value = i0
    jirl            $r0, $r1, 0x0
    .align 3

    EPILOGUE

0 commit comments

Comments
 (0)