Skip to content

Commit 360acc0

Browse files
yanchengyinshiyou
authored and committed
loongarch64: Add optimizations for swap.
1 parent 174c257 commit 360acc0

File tree

6 files changed

+1204
-0
lines changed

6 files changed

+1204
-0
lines changed

kernel/loongarch64/KERNEL.LOONGSON2K1000

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,4 +34,7 @@ IDAMINKERNEL = idamin_lsx.S
3434
SCOPYKERNEL = scopy_lsx.S
3535
DCOPYKERNEL = dcopy_lsx.S
3636

37+
SSWAPKERNEL = sswap_lsx.S
38+
DSWAPKERNEL = dswap_lsx.S
39+
3740
endif

kernel/loongarch64/KERNEL.LOONGSON3R5

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,9 @@ IDAMINKERNEL = idamin_lasx.S
3434
SCOPYKERNEL = scopy_lasx.S
3535
DCOPYKERNEL = dcopy_lasx.S
3636

37+
SSWAPKERNEL = sswap_lasx.S
38+
DSWAPKERNEL = dswap_lasx.S
39+
3740
DGEMMKERNEL = dgemm_kernel_16x4.S
3841
DGEMMINCOPY = dgemm_ncopy_16.S
3942
DGEMMITCOPY = dgemm_tcopy_16.S

kernel/loongarch64/dswap_lasx.S

Lines changed: 301 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,301 @@
1+
/***********************************************************************
 * dswap_lasx.S — double-precision SWAP kernel for LoongArch64 LASX.
 *
 * C-equivalent:
 *     for (i = 0; i < N; i++) swap(X[i*INCX], Y[i*INCY]);
 *
 * In:  N    ($r4)  element count (doubles)
 *      X    ($r7)  pointer to x
 *      INCX ($r8)  x stride in elements (scaled to bytes below)
 *      Y    ($r9)  pointer to y
 *      INCY ($r10) y stride in elements (scaled to bytes below)
 *
 * Four paths selected on the strides:
 *   .L11  INCX==1 && INCY==1 : full 256-bit loads/stores, 8 elems/iter
 *   .L12  INCX==1 && INCY!=1 : vector on X, element-wise gather/scatter on Y
 *   .L21  INCX!=1 && INCY==1 : vector on Y, element-wise gather/scatter on X
 *   .L22  both strided       : scalar loop, 8x unrolled
 * Each path handles the N%8 remainder with a scalar tail loop.
 *
 * NOTE(review): negative-INCX rewind in .L22 is commented out upstream;
 * behavior for INCX<0 is unchanged here — confirm against callers.
 ***********************************************************************/
#define ASSEMBLER

#include "common.h"
#define N      $r4
#define X      $r7
#define INCX   $r8
#define Y      $r9
#define INCY   $r10

#define I      $r17
#define TEMP   $r18
#define XX     $r5
#define YY     $r6
#define t1     $r14
#define t2     $r15
#define t3     $r16
#define t4     $r19
#define a1     $f12
#define a2     $f13
#define a3     $f14
#define a4     $f15
#define b1     $f16
#define b2     $f17
#define b3     $f18
#define b4     $f19
#define VX0    $xr12
#define VX1    $xr13
#define VX2    $xr14
#define VX3    $xr15

    PROLOGUE

    bge     $r0, N, .L999               // nothing to do for N <= 0
    li.d    TEMP, 1
    slli.d  TEMP, TEMP, BASE_SHIFT      // TEMP = sizeof(double)
    slli.d  INCX, INCX, BASE_SHIFT      // strides: elements -> bytes
    slli.d  INCY, INCY, BASE_SHIFT
    srai.d  I, N, 3                     // I = N / 8 (unroll factor)
    bne     INCX, TEMP, .L20
    bne     INCY, TEMP, .L12            // INCX==1 and INCY!=1
    b       .L11                        // INCX==1 and INCY==1
.L20:
    bne     INCY, TEMP, .L22            // INCX!=1 and INCY!=1
    b       .L21                        // INCX!=1 and INCY==1

/* ---- contiguous/contiguous: swap 8 doubles per iteration ---- */
.L11:
    bge     $r0, I, .L112
    .align 3

.L111:
    xvld    VX0, X, 0 * SIZE            // VX0:VX1 = X[0..7]
    xvld    VX1, X, 4 * SIZE
    xvld    VX2, Y, 0 * SIZE            // VX2:VX3 = Y[0..7]
    xvld    VX3, Y, 4 * SIZE
    addi.d  I, I, -1
    xvst    VX2, X, 0 * SIZE            // X[0..7] = old Y
    xvst    VX3, X, 4 * SIZE
    xvst    VX0, Y, 0 * SIZE            // Y[0..7] = old X
    xvst    VX1, Y, 4 * SIZE
    addi.d  X, X, 8 * SIZE
    addi.d  Y, Y, 8 * SIZE
    blt     $r0, I, .L111
    .align 3

.L112:                                  // scalar tail: N % 8 elements
    andi    I, N, 7
    bge     $r0, I, .L999
    .align 3

.L113:
    fld.d   $f12, X, 0 * SIZE
    fld.d   $f14, Y, 0 * SIZE
    addi.d  I, I, -1
    fst.d   $f12, Y, 0 * SIZE
    fst.d   $f14, X, 0 * SIZE
    addi.d  X, X, SIZE
    addi.d  Y, Y, SIZE
    blt     $r0, I, .L113
    b       .L999
    .align 3

/* ---- INCX==1, INCY!=1: vector X, element-wise Y ---- */
.L12:
    bge     $r0, I, .L122
    .align 3

.L121:
    xvld    VX0, X, 0 * SIZE            // X[0..3]
    ld.d    t1, Y, 0 * SIZE             // gather old Y element...
    xvstelm.d VX0, Y, 0, 0              // ...and scatter X element in its place
    add.d   Y, Y, INCY
    ld.d    t2, Y, 0 * SIZE
    xvstelm.d VX0, Y, 0, 1
    add.d   Y, Y, INCY
    ld.d    t3, Y, 0 * SIZE
    xvstelm.d VX0, Y, 0, 2
    add.d   Y, Y, INCY
    ld.d    t4, Y, 0 * SIZE
    xvstelm.d VX0, Y, 0, 3
    xvinsgr2vr.d VX2, t1, 0             // VX2 = old Y[0..3]
    xvinsgr2vr.d VX2, t2, 1
    xvinsgr2vr.d VX2, t3, 2
    xvinsgr2vr.d VX2, t4, 3
    add.d   Y, Y, INCY
    xvst    VX2, X, 0 * SIZE            // X[0..3] = old Y
    xvld    VX1, X, 4 * SIZE            // second half: X[4..7]
    ld.d    t1, Y, 0 * SIZE
    xvstelm.d VX1, Y, 0, 0
    add.d   Y, Y, INCY
    ld.d    t2, Y, 0 * SIZE
    xvstelm.d VX1, Y, 0, 1
    add.d   Y, Y, INCY
    ld.d    t3, Y, 0 * SIZE
    xvstelm.d VX1, Y, 0, 2
    add.d   Y, Y, INCY
    ld.d    t4, Y, 0 * SIZE
    xvstelm.d VX1, Y, 0, 3
    xvinsgr2vr.d VX3, t1, 0
    xvinsgr2vr.d VX3, t2, 1
    xvinsgr2vr.d VX3, t3, 2
    xvinsgr2vr.d VX3, t4, 3
    add.d   Y, Y, INCY
    xvst    VX3, X, 4 * SIZE            // X[4..7] = old Y
    addi.d  X, X, 8 * SIZE
    addi.d  I, I, -1
    blt     $r0, I, .L121
    .align 3

.L122:
    andi    I, N, 7
    bge     $r0, I, .L999
    .align 3

.L123:
    fld.d   $f12, X, 0 * SIZE
    fld.d   $f14, Y, 0 * SIZE
    addi.d  I, I, -1
    fst.d   $f12, Y, 0 * SIZE
    fst.d   $f14, X, 0 * SIZE
    addi.d  X, X, SIZE
    add.d   Y, Y, INCY
    blt     $r0, I, .L123
    b       .L999
    .align 3

/* ---- INCX!=1, INCY==1: vector Y, element-wise X (mirror of .L12) ---- */
.L21:
    bge     $r0, I, .L212
    .align 3

.L211:
    xvld    VX2, Y, 0 * SIZE            // Y[0..3]
    ld.d    t1, X, 0 * SIZE
    xvstelm.d VX2, X, 0, 0
    add.d   X, X, INCX
    ld.d    t2, X, 0 * SIZE
    xvstelm.d VX2, X, 0, 1
    add.d   X, X, INCX
    ld.d    t3, X, 0 * SIZE
    xvstelm.d VX2, X, 0, 2
    add.d   X, X, INCX
    ld.d    t4, X, 0 * SIZE
    xvstelm.d VX2, X, 0, 3
    xvinsgr2vr.d VX0, t1, 0             // VX0 = old X[0..3]
    xvinsgr2vr.d VX0, t2, 1
    xvinsgr2vr.d VX0, t3, 2
    xvinsgr2vr.d VX0, t4, 3
    add.d   X, X, INCX
    xvst    VX0, Y, 0 * SIZE            // Y[0..3] = old X
    xvld    VX3, Y, 4 * SIZE            // second half: Y[4..7]
    ld.d    t1, X, 0 * SIZE
    xvstelm.d VX3, X, 0, 0
    add.d   X, X, INCX                  // FIX: was INCY — X walks by its own stride
    ld.d    t2, X, 0 * SIZE
    xvstelm.d VX3, X, 0, 1
    add.d   X, X, INCX
    ld.d    t3, X, 0 * SIZE
    xvstelm.d VX3, X, 0, 2
    add.d   X, X, INCX
    ld.d    t4, X, 0 * SIZE
    xvstelm.d VX3, X, 0, 3
    xvinsgr2vr.d VX1, t1, 0
    xvinsgr2vr.d VX1, t2, 1
    xvinsgr2vr.d VX1, t3, 2
    xvinsgr2vr.d VX1, t4, 3
    add.d   X, X, INCX
    xvst    VX1, Y, 4 * SIZE            // FIX: was 0*SIZE, which clobbered Y[0..3]
    addi.d  Y, Y, 8 * SIZE
    addi.d  I, I, -1
    blt     $r0, I, .L211
    .align 3

.L212:
    andi    I, N, 7
    bge     $r0, I, .L999
    .align 3

.L213:
    fld.d   $f12, X, 0 * SIZE
    fld.d   $f14, Y, 0 * SIZE
    addi.d  I, I, -1
    fst.d   $f12, Y, 0 * SIZE
    fst.d   $f14, X, 0 * SIZE
    add.d   X, X, INCX
    addi.d  Y, Y, SIZE
    blt     $r0, I, .L213
    b       .L999
    .align 3

/* ---- both strided: scalar, 8x unrolled ---- */
.L22:
    bgez    INCX, .L220
    //addi.d TEMP, N, -1                // negative-stride rewind kept
    //mul.d  TEMP, TEMP, INCX           // disabled as in the original
    //sub.d  X, X, TEMP
    .align 3

.L220:
    bge     $r0, I, .L223
    move    XX, X                       // XX trails X for the deferred b-stores
    .align 3                            // align the loop head, not the setup

.L222:
    fld.d   a1, X, 0 * SIZE             // a1..a4 = X[0..3]
    add.d   X, X, INCX
    fld.d   a2, X, 0 * SIZE
    add.d   X, X, INCX
    fld.d   a3, X, 0 * SIZE
    add.d   X, X, INCX
    fld.d   a4, X, 0 * SIZE
    add.d   X, X, INCX
    fld.d   b1, Y, 0 * SIZE             // b = old Y, then Y[i] = a
    fst.d   a1, Y, 0 * SIZE
    add.d   Y, Y, INCY
    fld.d   b2, Y, 0 * SIZE
    fst.d   a2, Y, 0 * SIZE
    add.d   Y, Y, INCY
    fld.d   b3, Y, 0 * SIZE
    fst.d   a3, Y, 0 * SIZE
    add.d   Y, Y, INCY
    fld.d   b4, Y, 0 * SIZE
    fst.d   a4, Y, 0 * SIZE
    add.d   Y, Y, INCY
    fld.d   a1, X, 0 * SIZE             // elements 4..7, interleaving the
    add.d   X, X, INCX                  // deferred stores of b1..b4 into XX
    fst.d   b1, XX, 0 * SIZE
    add.d   XX, XX, INCX
    fld.d   b1, Y, 0 * SIZE
    fst.d   a1, Y, 0 * SIZE
    add.d   Y, Y, INCY
    fld.d   a2, X, 0 * SIZE
    add.d   X, X, INCX
    fst.d   b2, XX, 0 * SIZE
    add.d   XX, XX, INCX
    fld.d   b2, Y, 0 * SIZE
    fst.d   a2, Y, 0 * SIZE
    add.d   Y, Y, INCY
    fld.d   a3, X, 0 * SIZE
    add.d   X, X, INCX
    fst.d   b3, XX, 0 * SIZE
    add.d   XX, XX, INCX
    fld.d   b3, Y, 0 * SIZE
    fst.d   a3, Y, 0 * SIZE
    add.d   Y, Y, INCY                  // FIX: increment was missing — b4 reread
    fld.d   a4, X, 0 * SIZE             // the freshly stored a3 and Y[7] was skipped
    add.d   X, X, INCX
    fst.d   b4, XX, 0 * SIZE
    add.d   XX, XX, INCX
    fld.d   b4, Y, 0 * SIZE
    fst.d   a4, Y, 0 * SIZE
    add.d   Y, Y, INCY
    fst.d   b1, XX, 0 * SIZE            // flush old Y[4..7] into X[4..7]
    add.d   XX, XX, INCX
    fst.d   b2, XX, 0 * SIZE
    add.d   XX, XX, INCX
    fst.d   b3, XX, 0 * SIZE
    add.d   XX, XX, INCX
    fst.d   b4, XX, 0 * SIZE
    add.d   XX, XX, INCX
    addi.d  I, I, -1
    blt     $r0, I, .L222
    .align 3

.L223:
    andi    I, N, 7
    bge     $r0, I, .L999
    .align 3

.L224:
    fld.d   $f12, X, 0 * SIZE
    fld.d   $f14, Y, 0 * SIZE
    addi.d  I, I, -1
    fst.d   $f12, Y, 0 * SIZE
    fst.d   $f14, X, 0 * SIZE
    add.d   X, X, INCX
    add.d   Y, Y, INCY
    blt     $r0, I, .L224
    .align 3

.L999:
    move    $r4, $r12                   // NOTE(review): $r12 is never set as a GPR
                                        // here; SWAP's return value is unused by
                                        // BLAS callers — confirm before changing
    jirl    $r0, $r1, 0x0               // return
    .align 3

    EPILOGUE

0 commit comments

Comments
 (0)