Skip to content

Commit 3193aa9

Browse files
authored
Merge pull request #4362 from yinshiyou/la-dev
Add 15 level1 optimizations for LoongArch.
2 parents a8cb611 + d32f38f commit 3193aa9

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

70 files changed

+19998
-0
lines changed

kernel/loongarch64/KERNEL.LOONGSON2K1000

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,55 @@ SDOTKERNEL = dot_lsx.S
44
DSDOTKERNEL = dot_lsx.S
55
DDOTKERNEL = dot_lsx.S
66

7+
SSCALKERNEL = sscal_lsx.S
8+
DSCALKERNEL = dscal_lsx.S
9+
10+
SAMAXKERNEL = samax_lsx.S
11+
DAMAXKERNEL = damax_lsx.S
12+
13+
SAMINKERNEL = samin_lsx.S
14+
DAMINKERNEL = damin_lsx.S
15+
16+
SMAXKERNEL = smax_lsx.S
17+
DMAXKERNEL = dmax_lsx.S
18+
19+
SMINKERNEL = smin_lsx.S
20+
DMINKERNEL = dmin_lsx.S
21+
22+
ISMAXKERNEL = ismax_lsx.S
23+
IDMAXKERNEL = idmax_lsx.S
24+
25+
ISMINKERNEL = ismin_lsx.S
26+
IDMINKERNEL = idmin_lsx.S
27+
28+
ISAMAXKERNEL = isamax_lsx.S
29+
IDAMAXKERNEL = idamax_lsx.S
30+
31+
ISAMINKERNEL = isamin_lsx.S
32+
IDAMINKERNEL = idamin_lsx.S
33+
34+
SCOPYKERNEL = scopy_lsx.S
35+
DCOPYKERNEL = dcopy_lsx.S
36+
37+
SSWAPKERNEL = sswap_lsx.S
38+
DSWAPKERNEL = dswap_lsx.S
39+
40+
SAXPYKERNEL = saxpy_lsx.S
41+
DAXPYKERNEL = daxpy_lsx.S
42+
43+
SAXPBYKERNEL = saxpby_lsx.S
44+
DAXPBYKERNEL = daxpby_lsx.S
45+
46+
SSUMKERNEL = ssum_lsx.S
47+
DSUMKERNEL = dsum_lsx.S
48+
49+
SASUMKERNEL = sasum_lsx.S
50+
DASUMKERNEL = dasum_lsx.S
51+
52+
SROTKERNEL = srot_lsx.S
53+
DROTKERNEL = drot_lsx.S
54+
55+
SNRM2KERNEL = snrm2_lsx.S
56+
DNRM2KERNEL = dnrm2_lsx.S
57+
758
endif

kernel/loongarch64/KERNEL.LOONGSON3R5

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,57 @@ SDOTKERNEL = dot_lasx.S
44
DSDOTKERNEL = dot_lasx.S
55
DDOTKERNEL = dot_lasx.S
66

7+
SSCALKERNEL = sscal_lasx.S
8+
DSCALKERNEL = dscal_lasx.S
9+
10+
SAMAXKERNEL = samax_lasx.S
11+
DAMAXKERNEL = damax_lasx.S
12+
13+
SAMINKERNEL = samin_lasx.S
14+
DAMINKERNEL = damin_lasx.S
15+
16+
SMAXKERNEL = smax_lasx.S
17+
DMAXKERNEL = dmax_lasx.S
18+
19+
SMINKERNEL = smin_lasx.S
20+
DMINKERNEL = dmin_lasx.S
21+
22+
ISMAXKERNEL = ismax_lasx.S
23+
IDMAXKERNEL = idmax_lasx.S
24+
25+
ISMINKERNEL = ismin_lasx.S
26+
IDMINKERNEL = idmin_lasx.S
27+
28+
ISAMAXKERNEL = isamax_lasx.S
29+
IDAMAXKERNEL = idamax_lasx.S
30+
31+
ISAMINKERNEL = isamin_lasx.S
32+
IDAMINKERNEL = idamin_lasx.S
33+
34+
SCOPYKERNEL = scopy_lasx.S
35+
DCOPYKERNEL = dcopy_lasx.S
36+
37+
SSWAPKERNEL = sswap_lasx.S
38+
DSWAPKERNEL = dswap_lasx.S
39+
40+
SAXPYKERNEL = saxpy_lasx.S
41+
DAXPYKERNEL = daxpy_lasx.S
42+
43+
SAXPBYKERNEL = saxpby_lasx.S
44+
DAXPBYKERNEL = daxpby_lasx.S
45+
46+
SSUMKERNEL = ssum_lasx.S
47+
DSUMKERNEL = dsum_lasx.S
48+
49+
SASUMKERNEL = sasum_lasx.S
50+
DASUMKERNEL = dasum_lasx.S
51+
52+
SROTKERNEL = srot_lasx.S
53+
DROTKERNEL = drot_lasx.S
54+
55+
SNRM2KERNEL = snrm2_lasx.S
56+
DNRM2KERNEL = dnrm2_lasx.S
57+
758
DGEMMKERNEL = dgemm_kernel_16x4.S
859
DGEMMINCOPY = dgemm_ncopy_16.S
960
DGEMMITCOPY = dgemm_tcopy_16.S

kernel/loongarch64/damax_lasx.S

Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,183 @@
1+
#define ASSEMBLER

#include "common.h"

/*
 * damax (LoongArch64, LASX): return max(|x[i]|) over N double-precision
 * elements read with element stride INCX.
 *
 * In:    N    ($r4)  element count
 *        X    ($r5)  address of the first element
 *        INCX ($r6)  stride in elements (scaled to bytes in place below)
 * Out:   $f0         max |x[i]|, or 0.0 when N <= 0 or INCX <= 0
 * Clobb: $r12-$r18, $xr9-$xr12, $xr18, $xr20-$xr23 (as named below)
 *
 * The running maximum lives in VM0 ($xr22); the scalar code below addresses
 * its lane 0 as $f22, and scalar loads go through $f9 (lane 0 of x1).
 * Only lane 0 of VM0 is meaningful once a horizontal reduction or a scalar
 * tail loop has run.
 *
 * No load ever touches memory outside the N elements of the vector.
 */

#define N      $r4
#define X      $r5
#define INCX   $r6
#define I      $r12
#define J      $r13
#define t1     $r14
#define t2     $r18
#define t3     $r15
#define t4     $r17
#define TEMP   $r16
#define x1     $xr9
#define x2     $xr10
#define x3     $xr11
#define x4     $xr12
#define VX0    $xr20
#define VX1    $xr21
#define VM0    $xr22
#define VM1    $xr23
#define VM2    $xr18

    PROLOGUE

    bge         $r0,   N,    .L998          // N <= 0    -> return 0.0
    bge         $r0,   INCX, .L998          // INCX <= 0 -> return 0.0
    li.d        TEMP,  1
    slli.d      TEMP,  TEMP, BASE_SHIFT     // TEMP = byte size of one element
    slli.d      INCX,  INCX, BASE_SHIFT     // INCX = stride in bytes
    xvldrepl.d  VM0,   X,    0              // seed every lane with x[0]; reads one element only
    srai.d      I,     N,    3              // I = number of full 8-element groups
    bne         INCX,  TEMP, .L20           // non-unit stride -> gather path
    bge         $r0,   I,    .L12           // fewer than 8 elements -> tail only
    .align 3

.L10:                                       // unit stride: 8 elements per iteration
    xvld        VX0,   X,    0 * SIZE
    xvld        VX1,   X,    4 * SIZE
    addi.d      I,     I,    -1
    xvfmaxa.d   VM1,   VX1,  VX0
    addi.d      X,     X,    8 * SIZE
    xvfmaxa.d   VM0,   VM0,  VM1
    blt         $r0,   I,    .L10
    .align 3

.L11:                                       // horizontal reduction of VM0 into lane 0
    xvpickve.d  x1,    VM0,  0
    xvpickve.d  x2,    VM0,  1
    xvpickve.d  x3,    VM0,  2
    xvpickve.d  x4,    VM0,  3
    xvfmaxa.d   VM1,   x1,   x2
    xvfmaxa.d   VM2,   x3,   x4
    xvfmaxa.d   VM0,   VM1,  VM2
    .align 3

.L12:                                       // unit stride: rem = N % 8 elements left at X
    andi        I,     N,    7
    li.d        J,     4
    bge         J,     I,    .L13           // rem <= 4 -> scalar loop
    // 5 <= rem <= 7: cover the tail with two overlapping 4-element loads,
    // x[0..3] and x[rem-4..rem-1]; both stay inside the remaining elements
    // and re-reading the overlap is harmless for a maximum.
    xvld        VX0,   X,    0
    addi.d      I,     I,    -4             // rem - 4 (1..3)
    slli.d      I,     I,    BASE_SHIFT
    xvldx       VX1,   X,    I
    xvfmaxa.d   VX0,   VX0,  VX1
    xvpickve.d  x1,    VX0,  0
    xvpickve.d  x2,    VX0,  1
    xvpickve.d  x3,    VX0,  2
    xvpickve.d  x4,    VX0,  3
    xvfmaxa.d   VM1,   x1,   x2
    xvfmaxa.d   VM2,   x3,   x4
    xvfmaxa.d   VM1,   VM1,  VM2
    xvfmaxa.d   VM0,   VM0,  VM1            // fold into the running maximum (lane 0)
    b           .L999
    .align 3

.L13:                                       // unit stride: 0 <= rem <= 4
    bge         $r0,   I,    .L999
    .align 3

.L14:                                       // one element per iteration, scalar
    fld.d       $f9,   X,    0
    addi.d      I,     I,    -1
    fmaxa.d     $f22,  $f22, $f9
    addi.d      X,     X,    SIZE
    blt         $r0,   I,    .L14
    b           .L999
    .align 3

.L20:                                       // INCX != 1; VM0 already seeded, I = N / 8
    bge         $r0,   I,    .L23
    .align 3

.L21:                                       // gather 8 strided elements into VX0 / VX1
    ld.d        t1,    X,    0 * SIZE
    add.d       X,     X,    INCX
    ld.d        t2,    X,    0 * SIZE
    add.d       X,     X,    INCX
    ld.d        t3,    X,    0 * SIZE
    add.d       X,     X,    INCX
    ld.d        t4,    X,    0 * SIZE
    add.d       X,     X,    INCX
    xvinsgr2vr.d VX0,  t1,   0
    xvinsgr2vr.d VX0,  t2,   1
    xvinsgr2vr.d VX0,  t3,   2
    xvinsgr2vr.d VX0,  t4,   3
    ld.d        t1,    X,    0 * SIZE
    add.d       X,     X,    INCX
    ld.d        t2,    X,    0 * SIZE
    add.d       X,     X,    INCX
    ld.d        t3,    X,    0 * SIZE
    add.d       X,     X,    INCX
    ld.d        t4,    X,    0 * SIZE
    add.d       X,     X,    INCX
    xvinsgr2vr.d VX1,  t1,   0
    xvinsgr2vr.d VX1,  t2,   1
    xvinsgr2vr.d VX1,  t3,   2
    xvinsgr2vr.d VX1,  t4,   3
    addi.d      I,     I,    -1
    xvfmaxa.d   VM1,   VX1,  VX0
    xvfmaxa.d   VM0,   VM1,  VM0
    blt         $r0,   I,    .L21
    .align 3

.L22:                                       // horizontal reduction of VM0 into lane 0
    xvpickve.d  x1,    VM0,  0
    xvpickve.d  x2,    VM0,  1
    xvpickve.d  x3,    VM0,  2
    xvpickve.d  x4,    VM0,  3
    xvfmaxa.d   VM1,   x1,   x2
    xvfmaxa.d   VM2,   x3,   x4
    xvfmaxa.d   VM0,   VM1,  VM2
    .align 3

.L23:                                       // INCX != 1: rem = N % 8 elements left at X
    andi        I,     N,    7
    bge         $r0,   I,    .L999
    .align 3

.L24:                                       // one strided element per iteration, scalar
    fld.d       $f9,   X,    0
    addi.d      I,     I,    -1
    fmaxa.d     $f22,  $f22, $f9
    add.d       X,     X,    INCX
    blt         $r0,   I,    .L24
    .align 3

.L999:                                      // result = |lane 0 of VM0|
    fabs.d      $f0,   $f22
    jirl        $r0,   $r1,  0x0
    .align 3

.L998:                                      // empty or invalid input: return +0.0
    movgr2fr.d  $f0,   $r0
    jirl        $r0,   $r1,  0x0
    .align 3

    EPILOGUE

0 commit comments

Comments
 (0)