Skip to content

Commit 346b384

Browse files
yanchengyinshiyou
authored and committed
loongarch64: Add optimization for max.
1 parent ff2ecc6 commit 346b384

File tree

6 files changed

+698
-0
lines changed

6 files changed

+698
-0
lines changed

kernel/loongarch64/KERNEL.LOONGSON2K1000

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,4 +13,7 @@ DAMAXKERNEL = damax_lsx.S
1313
SAMINKERNEL = samin_lsx.S
1414
DAMINKERNEL = damin_lsx.S
1515

16+
SMAXKERNEL = smax_lsx.S
17+
DMAXKERNEL = dmax_lsx.S
18+
1619
endif

kernel/loongarch64/KERNEL.LOONGSON3R5

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,9 @@ DAMAXKERNEL = damax_lasx.S
1313
SAMINKERNEL = samin_lasx.S
1414
DAMINKERNEL = damin_lasx.S
1515

16+
SMAXKERNEL = smax_lasx.S
17+
DMAXKERNEL = dmax_lasx.S
18+
1619
DGEMMKERNEL = dgemm_kernel_16x4.S
1720
DGEMMINCOPY = dgemm_ncopy_16.S
1821
DGEMMITCOPY = dgemm_tcopy_16.S

kernel/loongarch64/dmax_lasx.S

Lines changed: 175 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,175 @@
1+
#define ASSEMBLER

#include "common.h"

/*
 * dmax kernel, LoongArch LASX (256-bit vectors, 4 doubles per register).
 *
 * Returns in $f0 the maximum of N double-precision values read from X
 * with a stride of INCX elements.
 *
 * In:    N    ($r4)  element count
 *        X    ($r5)  address of the first element
 *        INCX ($r6)  stride in elements; scaled to bytes below
 * Out:   $f0         maximum value; 0.0 when N <= 0 or INCX <= 0
 * Uses:  $r12-$r18, $xr8-$xr12, $xr19-$xr23
 *
 * Invariant: lane 0 of VM0 ($xr22, whose low 64 bits are $f22) holds the
 * running maximum on every path that reaches a return.
 *
 * NOTE(review): the vector offsets (4 * SIZE, 8 * SIZE) assume SIZE == 8
 * and BASE_SHIFT == 3 as provided by common.h for a double build - confirm.
 */

/* Integer registers */
#define N      $r4
#define X      $r5
#define INCX   $r6
#define I      $r12	/* loop counter / tail length / byte offset */
#define J      $r13	/* small constant scratch */
#define t1     $r14	/* t1..t4: gathered elements for the strided path */
#define t2     $r18
#define t3     $r15
#define t4     $r17
#define TEMP   $r16	/* byte size of one element, for the stride test */

/* Vector registers */
#define m0     $xr8	/* tail accumulator */
#define x1     $xr9	/* x1..x4: single lanes during horizontal folds */
#define x2     $xr10
#define x3     $xr11
#define x4     $xr12
#define VX0    $xr20	/* input vectors */
#define VX1    $xr21
#define VM0    $xr22	/* running maximum */
#define VM1    $xr23	/* partial maxima */
#define VM2    $xr19

	PROLOGUE

	// Define the result for the early exits below; without this the
	// routine would return whatever happened to be in $f22.
	xvxor.v		VM0, VM0, VM0
	bge		$r0, N, .L999
	bge		$r0, INCX, .L999
	li.d		TEMP, 1
	slli.d		TEMP, TEMP, BASE_SHIFT	// TEMP = bytes per element
	slli.d		INCX, INCX, BASE_SHIFT	// INCX = stride in bytes
	bne		INCX, TEMP, .L20	// non-unit stride -> gather path

	// ---- unit stride ------------------------------------------------
	// Seed every lane with X[0]. A replicated load touches exactly one
	// element, so it stays in bounds even when N < 4.
	xvldrepl.d	VM0, X, 0
	srai.d		I, N, 3			// I = number of 8-element blocks
	bge		$r0, I, .L12
	.align 3

.L10:	// main loop: 8 elements per iteration
	xvld		VX0, X, 0 * SIZE
	xvld		VX1, X, 4 * SIZE
	addi.d		I, I, -1
	xvfmax.d	VM1, VX1, VX0
	addi.d		X, X, 8 * SIZE
	xvfmax.d	VM0, VM0, VM1
	blt		$r0, I, .L10
	.align 3

.L11:	// fold the four lanes of VM0 into lane 0
	xvpickve.d	x1, VM0, 0
	xvpickve.d	x2, VM0, 1
	xvpickve.d	x3, VM0, 2
	xvpickve.d	x4, VM0, 3
	xvfmax.d	VM1, x1, x2
	xvfmax.d	VM2, x3, x4
	xvfmax.d	VM0, VM1, VM2
	.align 3

.L12:	// unit-stride tail: r = N mod 8 elements remain at X
	andi		I, N, 7
	li.d		J, 4
	bge		J, I, .L13		// r <= 4 -> element-wise loop

	// 4 < r < 8: cover the tail with two overlapping 4-element loads,
	// X[0..3] and X[r-4..r-1]. Both stay inside the tail; elements seen
	// twice do not change a maximum.
	xvld		VX0, X, 0
	sub.d		I, I, J			// I = r - 4
	slli.d		I, I, BASE_SHIFT	// ... as a byte offset
	xvldx		VX1, X, I
	xvfmax.d	m0, VX0, VX1
	xvpickve.d	x1, m0, 0
	xvpickve.d	x2, m0, 1
	xvpickve.d	x3, m0, 2
	xvpickve.d	x4, m0, 3
	xvfmax.d	VM1, x1, x2
	xvfmax.d	m0, x3, x4
	xvfmax.d	m0, m0, VM1
	xvfmax.d	VM0, m0, VM0		// merge with the running maximum
	fmov.d		$f0, $f22
	jirl		$r0, $r1, 0x0
	.align 3

.L13:	// unit stride, 0 <= r <= 4
	bge		$r0, I, .L15
	.align 3

.L14:	// one element per iteration; replicated load avoids reading past
	// the end of the array
	xvldrepl.d	x1, X, 0
	addi.d		I, I, -1
	xvfmax.d	VM0, VM0, x1
	addi.d		X, X, SIZE
	blt		$r0, I, .L14
	.align 3

.L15:
	fmov.d		$f0, $f22
	jirl		$r0, $r1, 0x0
	.align 3

.L20:	// ---- non-unit stride -------------------------------------------
	// Seed every lane with X[0]; the loops below visit every element
	// (including X[0] again), so no further preloading is required.
	xvldrepl.d	VM0, X, 0
	srai.d		I, N, 3			// I = number of 8-element blocks
	bge		$r0, I, .L23
	.align 3

.L21:	// gather 8 strided elements into VX0 / VX1 via integer registers
	ld.d		t1, X, 0 * SIZE
	add.d		X, X, INCX
	ld.d		t2, X, 0 * SIZE
	add.d		X, X, INCX
	ld.d		t3, X, 0 * SIZE
	add.d		X, X, INCX
	ld.d		t4, X, 0 * SIZE
	add.d		X, X, INCX
	xvinsgr2vr.d	VX0, t1, 0
	xvinsgr2vr.d	VX0, t2, 1
	xvinsgr2vr.d	VX0, t3, 2
	xvinsgr2vr.d	VX0, t4, 3
	ld.d		t1, X, 0 * SIZE
	add.d		X, X, INCX
	ld.d		t2, X, 0 * SIZE
	add.d		X, X, INCX
	ld.d		t3, X, 0 * SIZE
	add.d		X, X, INCX
	ld.d		t4, X, 0 * SIZE
	add.d		X, X, INCX
	xvinsgr2vr.d	VX1, t1, 0
	xvinsgr2vr.d	VX1, t2, 1
	xvinsgr2vr.d	VX1, t3, 2
	xvinsgr2vr.d	VX1, t4, 3
	addi.d		I, I, -1
	xvfmax.d	VM1, VX1, VX0
	xvfmax.d	VM0, VM1, VM0
	blt		$r0, I, .L21
	.align 3

.L22:	// fold the four lanes of VM0 into lane 0
	xvpickve.d	x1, VM0, 0
	xvpickve.d	x2, VM0, 1
	xvpickve.d	x3, VM0, 2
	xvpickve.d	x4, VM0, 3
	xvfmax.d	VM1, x1, x2
	xvfmax.d	VM2, x3, x4
	xvfmax.d	VM0, VM1, VM2
	.align 3

.L23:	// strided tail: N mod 8 elements remain
	andi		I, N, 7
	bge		$r0, I, .L999
	.align 3

.L24:	// one element per iteration; replicated load reads only that element
	xvldrepl.d	x1, X, 0
	addi.d		I, I, -1
	xvfmax.d	VM0, VM0, x1
	add.d		X, X, INCX
	blt		$r0, I, .L24
	.align 3

.L999:	// common exit: result is lane 0 of VM0
	fmov.d		$f0, $f22
	jirl		$r0, $r1, 0x0
	.align 3

	EPILOGUE

kernel/loongarch64/dmax_lsx.S

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
#define ASSEMBLER

#include "common.h"

/*
 * dmax kernel, LoongArch LSX (128-bit vectors, 2 doubles per register).
 *
 * Returns in $f0 the maximum of N double-precision values read from X
 * with a stride of INCX elements.
 *
 * In:    N    ($r4)  element count
 *        X    ($r5)  address of the first element
 *        INCX ($r6)  stride in elements; scaled to bytes below
 * Out:   $f0         maximum value; 0.0 when N <= 0 or INCX <= 0
 * Uses:  $r12, $r14-$r18, $vr9, $vr10, $vr18-$vr23
 *
 * Invariant: lane 0 of VM0 ($vr22, whose low 64 bits are $f22) holds the
 * running maximum on every path that reaches a return.
 *
 * NOTE(review): the vector offsets (2 * SIZE ... 8 * SIZE) assume
 * SIZE == 8 and BASE_SHIFT == 3 as provided by common.h for a double
 * build - confirm.
 */

/* Integer registers */
#define N      $r4
#define X      $r5
#define INCX   $r6
#define I      $r12	/* loop counter / tail length */
#define t1     $r14	/* t1..t4: gathered elements for the strided path */
#define t2     $r18
#define t3     $r15
#define t4     $r17
#define TEMP   $r16	/* byte size of one element, for the stride test */

/* Vector registers */
#define x1     $vr9	/* single element in the tail loops */
#define x2     $vr10	/* lane 1 broadcast during the horizontal fold */
#define VX0    $vr20	/* input vectors */
#define VX1    $vr21
#define VM0    $vr22	/* running maximum */
#define VM1    $vr23	/* partial maxima */
#define VM2    $vr19
#define VM3    $vr18

	PROLOGUE

	// Define the result for the early exits below; without this the
	// routine would return whatever happened to be in $f22.
	vxor.v		VM0, VM0, VM0
	bge		$r0, N, .L999
	bge		$r0, INCX, .L999
	li.d		TEMP, 1
	slli.d		TEMP, TEMP, BASE_SHIFT	// TEMP = bytes per element
	slli.d		INCX, INCX, BASE_SHIFT	// INCX = stride in bytes
	bne		INCX, TEMP, .L20	// non-unit stride -> gather path

	// ---- unit stride ------------------------------------------------
	// Seed both lanes with X[0]. A replicated load touches exactly one
	// element, so it stays in bounds even when N == 1.
	vldrepl.d	VM0, X, 0
	srai.d		I, N, 3			// I = number of 8-element blocks
	bge		$r0, I, .L12
	.align 3

.L10:	// main loop: 8 elements per iteration
	vld		VX0, X, 0 * SIZE
	vld		VX1, X, 2 * SIZE
	addi.d		I, I, -1
	vfmax.d		VM1, VX1, VX0
	vld		VX0, X, 4 * SIZE
	vld		VX1, X, 6 * SIZE
	vfmax.d		VM2, VX1, VX0
	vfmax.d		VM3, VM1, VM2
	addi.d		X, X, 8 * SIZE
	vfmax.d		VM0, VM0, VM3
	blt		$r0, I, .L10
	.align 3

.L11:	// fold lane 1 into lane 0
	vreplvei.d	x2, VM0, 1
	vfmax.d		VM0, VM0, x2
	.align 3

.L12:	// unit-stride tail: N mod 8 elements remain at X
	andi		I, N, 7
	bge		$r0, I, .L14
	.align 3

.L13:	// one element per iteration; replicated load avoids reading past
	// the end of the array
	vldrepl.d	x1, X, 0
	addi.d		I, I, -1
	vfmax.d		VM0, VM0, x1
	addi.d		X, X, SIZE
	blt		$r0, I, .L13
	.align 3

.L14:
	fmov.d		$f0, $f22
	jirl		$r0, $r1, 0x0
	.align 3

.L20:	// ---- non-unit stride -------------------------------------------
	// Seed both lanes with X[0]; the loops below visit every element
	// (including X[0] again), so no further preloading is required.
	vldrepl.d	VM0, X, 0
	srai.d		I, N, 3			// I = number of 8-element blocks
	bge		$r0, I, .L23
	.align 3

.L21:	// gather 8 strided elements, two per vector, via integer registers
	ld.d		t1, X, 0 * SIZE
	add.d		X, X, INCX
	ld.d		t2, X, 0 * SIZE
	add.d		X, X, INCX
	vinsgr2vr.d	VX0, t1, 0
	vinsgr2vr.d	VX0, t2, 1
	ld.d		t3, X, 0 * SIZE
	add.d		X, X, INCX
	ld.d		t4, X, 0 * SIZE
	add.d		X, X, INCX
	vinsgr2vr.d	VX1, t3, 0
	vinsgr2vr.d	VX1, t4, 1
	vfmax.d		VM1, VX0, VX1
	ld.d		t1, X, 0 * SIZE
	add.d		X, X, INCX
	ld.d		t2, X, 0 * SIZE
	add.d		X, X, INCX
	vinsgr2vr.d	VX0, t1, 0
	vinsgr2vr.d	VX0, t2, 1
	ld.d		t3, X, 0 * SIZE
	add.d		X, X, INCX
	ld.d		t4, X, 0 * SIZE
	add.d		X, X, INCX
	vinsgr2vr.d	VX1, t3, 0
	vinsgr2vr.d	VX1, t4, 1
	addi.d		I, I, -1
	vfmax.d		VM2, VX0, VX1
	vfmax.d		VM3, VM1, VM2
	vfmax.d		VM0, VM0, VM3
	blt		$r0, I, .L21
	.align 3

.L22:	// fold lane 1 into lane 0
	vreplvei.d	x2, VM0, 1
	vfmax.d		VM0, VM0, x2
	.align 3

.L23:	// strided tail: N mod 8 elements remain
	andi		I, N, 7
	bge		$r0, I, .L999
	.align 3

.L24:	// one element per iteration; replicated load reads only that element
	vldrepl.d	x1, X, 0
	addi.d		I, I, -1
	vfmax.d		VM0, VM0, x1
	add.d		X, X, INCX
	blt		$r0, I, .L24
	.align 3

.L999:	// common exit: result is lane 0 of VM0
	fmov.d		$f0, $f22
	jirl		$r0, $r1, 0x0
	.align 3

	EPILOGUE

0 commit comments

Comments
 (0)