Skip to content

Commit 174c257

Browse files
yanchengyinshiyou
authored and committed
loongarch64: Add optimizations for copy.
1 parent 49829b2 commit 174c257

File tree

6 files changed

+898
-0
lines changed

6 files changed

+898
-0
lines changed

kernel/loongarch64/KERNEL.LOONGSON2K1000

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,4 +31,7 @@ IDAMAXKERNEL = idamax_lsx.S
3131
ISAMINKERNEL = isamin_lsx.S
3232
IDAMINKERNEL = idamin_lsx.S
3333

34+
SCOPYKERNEL = scopy_lsx.S
35+
DCOPYKERNEL = dcopy_lsx.S
36+
3437
endif

kernel/loongarch64/KERNEL.LOONGSON3R5

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,9 @@ IDAMAXKERNEL = idamax_lasx.S
3131
ISAMINKERNEL = isamin_lasx.S
3232
IDAMINKERNEL = idamin_lasx.S
3333

34+
SCOPYKERNEL = scopy_lasx.S
35+
DCOPYKERNEL = dcopy_lasx.S
36+
3437
DGEMMKERNEL = dgemm_kernel_16x4.S
3538
DGEMMINCOPY = dgemm_ncopy_16.S
3639
DGEMMITCOPY = dgemm_tcopy_16.S

kernel/loongarch64/dcopy_lasx.S

Lines changed: 224 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,224 @@
#define ASSEMBLER

#include "common.h"

/***********************************************************************
 * DCOPY kernel (LASX): y[i*incy] = x[i*incx] for i = 0 .. n-1
 *
 * C-equivalent: int CNAME(BLASLONG n, double *x, BLASLONG incx,
 *                         double *y, BLASLONG incy)
 * In:  $r4 = n, $r5 = x, $r6 = incx, $r7 = y, $r8 = incy
 *      (incx/incy are element strides; scaled to bytes below)
 * Out: return value in $r4 is ignored by callers of copy kernels
 * Main loops process 8 doubles per iteration (two 256-bit LASX regs);
 * the tail handles n % 8 scalars.
 ***********************************************************************/

#define N      $r4
#define X      $r5
#define INCX   $r6
#define Y      $r7
#define INCY   $r8
#define I      $r17
#define TEMP   $r18
#define t1     $r14
#define t2     $r15
#define t3     $r16
#define t4     $r19
#define a1     $f12
#define a2     $f13
#define a3     $f14
#define a4     $f15
#define VX0    $xr12
#define VX1    $xr13

    PROLOGUE

    bge     $r0, N, .L999               // nothing to do for n <= 0
    li.d    TEMP, 1
    slli.d  TEMP, TEMP, BASE_SHIFT      // TEMP = sizeof(double)
    slli.d  INCX, INCX, BASE_SHIFT      // element stride -> byte stride
    slli.d  INCY, INCY, BASE_SHIFT
    srai.d  I, N, 3                     // I = n / 8 (vector iterations)
    bne     INCX, TEMP, .L20
    bne     INCY, TEMP, .L12            // INCX == 1 && INCY != 1
    b       .L11                        // INCX == 1 && INCY == 1
.L20:
    bne     INCY, TEMP, .L22            // INCX != 1 && INCY != 1
    b       .L21                        // INCX != 1 && INCY == 1

/* ---- contiguous -> contiguous: straight 256-bit loads/stores ---- */
.L11:
    bge     $r0, I, .L112
    .align 3

.L111:
    xvld    VX0, X, 0 * SIZE
    xvld    VX1, X, 4 * SIZE
    xvst    VX0, Y, 0 * SIZE
    xvst    VX1, Y, 4 * SIZE
    addi.d  X, X, 8 * SIZE
    addi.d  Y, Y, 8 * SIZE
    addi.d  I, I, -1
    blt     $r0, I, .L111
    .align 3

.L112:
    andi    I, N, 7                     // tail: n % 8 scalars
    bge     $r0, I, .L999
    .align 3

.L113:
    fld.d   $f12, X, 0 * SIZE
    addi.d  I, I, -1
    addi.d  X, X, SIZE
    fst.d   $f12, Y, 0 * SIZE
    addi.d  Y, Y, SIZE
    blt     $r0, I, .L113
    b       .L999
    .align 3

/* ---- contiguous -> strided: vector load, per-element store ---- */
.L12:
    bge     $r0, I, .L122
    .align 3

.L121:
    xvld    VX0, X, 0 * SIZE
    xvld    VX1, X, 4 * SIZE
    xvstelm.d VX0, Y, 0, 0              // scatter lanes to strided y
    add.d   Y, Y, INCY
    xvstelm.d VX0, Y, 0, 1
    add.d   Y, Y, INCY
    xvstelm.d VX0, Y, 0, 2
    add.d   Y, Y, INCY
    xvstelm.d VX0, Y, 0, 3
    add.d   Y, Y, INCY
    xvstelm.d VX1, Y, 0, 0
    add.d   Y, Y, INCY
    xvstelm.d VX1, Y, 0, 1
    add.d   Y, Y, INCY
    xvstelm.d VX1, Y, 0, 2
    add.d   Y, Y, INCY
    xvstelm.d VX1, Y, 0, 3
    add.d   Y, Y, INCY
    addi.d  X, X, 8 * SIZE
    addi.d  I, I, -1
    blt     $r0, I, .L121
    .align 3

.L122:
    andi    I, N, 7
    bge     $r0, I, .L999
    .align 3

.L123:
    fld.d   $f12, X, 0 * SIZE
    addi.d  I, I, -1
    addi.d  X, X, SIZE
    fst.d   $f12, Y, 0 * SIZE
    add.d   Y, Y, INCY
    blt     $r0, I, .L123
    b       .L999
    .align 3

/* ---- strided -> contiguous: per-element gather, vector store ---- */
.L21:
    bge     $r0, I, .L212
    .align 3

.L211:
    ld.d    t1, X, 0 * SIZE             // gather 4 doubles via GP regs
    add.d   X, X, INCX
    ld.d    t2, X, 0 * SIZE
    add.d   X, X, INCX
    ld.d    t3, X, 0 * SIZE
    add.d   X, X, INCX
    ld.d    t4, X, 0 * SIZE
    add.d   X, X, INCX
    xvinsgr2vr.d VX0, t1, 0
    xvinsgr2vr.d VX0, t2, 1
    xvinsgr2vr.d VX0, t3, 2
    xvinsgr2vr.d VX0, t4, 3
    xvst    VX0, Y, 0 * SIZE
    ld.d    t1, X, 0 * SIZE
    add.d   X, X, INCX
    ld.d    t2, X, 0 * SIZE
    add.d   X, X, INCX
    ld.d    t3, X, 0 * SIZE
    add.d   X, X, INCX
    ld.d    t4, X, 0 * SIZE
    add.d   X, X, INCX
    xvinsgr2vr.d VX1, t1, 0
    xvinsgr2vr.d VX1, t2, 1
    xvinsgr2vr.d VX1, t3, 2
    xvinsgr2vr.d VX1, t4, 3
    xvst    VX1, Y, 4 * SIZE
    addi.d  Y, Y, 8 * SIZE
    addi.d  I, I, -1
    blt     $r0, I, .L211
    .align 3

.L212:
    andi    I, N, 7
    bge     $r0, I, .L999
    .align 3

.L213:
    fld.d   $f12, X, 0 * SIZE
    addi.d  I, I, -1
    fst.d   $f12, Y, 0 * SIZE
    add.d   X, X, INCX
    addi.d  Y, Y, SIZE
    blt     $r0, I, .L213
    b       .L999
    .align 3

/* ---- strided -> strided: scalar copy, 8 elements per iteration ----
 * NOTE: the original code had a no-op `bgez INCX, .L220` falling into
 * .L220 either way; removed. It also stored a3/a4 through X instead of
 * Y, corrupting the source array — fixed below. */
.L22:
    bge     $r0, I, .L223
    .align 3

.L222:
    fld.d   a1, X, 0 * SIZE
    add.d   X, X, INCX
    fld.d   a2, X, 0 * SIZE
    add.d   X, X, INCX
    fld.d   a3, X, 0 * SIZE
    add.d   X, X, INCX
    fld.d   a4, X, 0 * SIZE
    add.d   X, X, INCX
    fst.d   a1, Y, 0 * SIZE
    add.d   Y, Y, INCY
    fst.d   a2, Y, 0 * SIZE
    add.d   Y, Y, INCY
    fst.d   a3, Y, 0 * SIZE             // FIX: was `fst.d a3, X, ...`
    add.d   Y, Y, INCY
    fst.d   a4, Y, 0 * SIZE             // FIX: was `fst.d a4, X, ...`
    add.d   Y, Y, INCY
    fld.d   a1, X, 0 * SIZE
    add.d   X, X, INCX
    fld.d   a2, X, 0 * SIZE
    add.d   X, X, INCX
    fld.d   a3, X, 0 * SIZE
    add.d   X, X, INCX
    fld.d   a4, X, 0 * SIZE
    add.d   X, X, INCX
    fst.d   a1, Y, 0 * SIZE
    add.d   Y, Y, INCY
    fst.d   a2, Y, 0 * SIZE
    add.d   Y, Y, INCY
    fst.d   a3, Y, 0 * SIZE             // FIX: was `fst.d a3, X, ...`
    add.d   Y, Y, INCY
    fst.d   a4, Y, 0 * SIZE             // FIX: was `fst.d a4, X, ...`
    add.d   Y, Y, INCY
    addi.d  I, I, -1
    blt     $r0, I, .L222
    .align 3

.L223:
    andi    I, N, 7
    bge     $r0, I, .L999
    .align 3

.L224:
    fld.d   $f12, X, 0 * SIZE
    addi.d  I, I, -1
    fst.d   $f12, Y, 0 * SIZE
    add.d   X, X, INCX
    add.d   Y, Y, INCY
    blt     $r0, I, .L224
    .align 3

.L999:
    move    $r4, $r12                   // return value unused by callers
    jirl    $r0, $r1, 0x0
    .align 3

    EPILOGUE

0 commit comments

Comments
 (0)