
Commit 702fc1d

yanchengyinshiyou authored and committed
loongarch64: Add optimization for min.
1 parent 346b384 commit 702fc1d

File tree

6 files changed: +703 -0 lines changed


kernel/loongarch64/KERNEL.LOONGSON2K1000

Lines changed: 3 additions & 0 deletions
@@ -16,4 +16,7 @@ DAMINKERNEL = damin_lsx.S
 SMAXKERNEL = smax_lsx.S
 DMAXKERNEL = dmax_lsx.S
 
+SMINKERNEL = smin_lsx.S
+DMINKERNEL = dmin_lsx.S
+
 endif

kernel/loongarch64/KERNEL.LOONGSON3R5

Lines changed: 3 additions & 0 deletions
@@ -16,6 +16,9 @@ DAMINKERNEL = damin_lasx.S
 SMAXKERNEL = smax_lasx.S
 DMAXKERNEL = dmax_lasx.S
 
+SMINKERNEL = smin_lasx.S
+DMINKERNEL = dmin_lasx.S
+
 DGEMMKERNEL = dgemm_kernel_16x4.S
 DGEMMINCOPY = dgemm_ncopy_16.S
 DGEMMITCOPY = dgemm_tcopy_16.S
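
Both KERNEL files route the single- and double-precision MIN kernels to the new LSX/LASX sources via the SMINKERNEL/DMINKERNEL make variables. For orientation, here is a minimal C sketch of the semantics every variant must implement (find the smallest value among N elements read with stride INCX); dmin_ref is a hypothetical name used only for illustration, and its NaN behaviour differs from the IEEE minimum-number semantics of vfmin.d/xvfmin.d:

    /* Reference sketch only, not the actual OpenBLAS fallback kernel. */
    double dmin_ref(long n, const double *x, long incx)
    {
        if (n <= 0 || incx <= 0)
            return 0.0;          /* placeholder; the assembly simply returns early */
        double m = x[0];
        for (long i = 1; i < n; i++)
            if (x[i * incx] < m)
                m = x[i * incx];
        return m;
    }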

kernel/loongarch64/dmin_lasx.S

Lines changed: 175 additions & 0 deletions
@@ -0,0 +1,175 @@
#define ASSEMBLER

#include "common.h"

#define N     $r4
#define X     $r5
#define INCX  $r6
#define I     $r12
#define J     $r13
#define t1    $r14
#define t2    $r18
#define t3    $r15
#define t4    $r17
#define TEMP  $r16
#define m0    $xr8
#define x1    $xr9
#define x2    $xr10
#define x3    $xr11
#define x4    $xr12
#define VX0   $xr20
#define VX1   $xr21
#define VM0   $xr22
#define VM1   $xr23
#define VM2   $xr19

    PROLOGUE

    bge $r0, N, .L999
    bge $r0, INCX, .L999
    li.d TEMP, 1
    slli.d TEMP, TEMP, BASE_SHIFT
    slli.d INCX, INCX, BASE_SHIFT
    bne INCX, TEMP, .L20
    xvld VM0, X, 0
    srai.d I, N, 3
    bge $r0, I, .L12
    .align 3

.L10: // INCX==1, 8 doubles per iteration
    xvld VX0, X, 0 * SIZE
    xvld VX1, X, 4 * SIZE
    addi.d I, I, -1
    xvfmin.d VM1, VX1, VX0
    addi.d X, X, 8 * SIZE
    xvfmin.d VM0, VM0, VM1
    blt $r0, I, .L10
    .align 3

.L11: // fold the four lanes of VM0
    xvpickve.d x1, VM0, 0
    xvpickve.d x2, VM0, 1
    xvpickve.d x3, VM0, 2
    xvpickve.d x4, VM0, 3
    xvfmin.d VM1, x1, x2
    xvfmin.d VM2, x3, x4
    xvfmin.d VM0, VM1, VM2
    .align 3

.L12: // INCX==1, remainder N&7
    andi I, N, 7
    li.d J, 4
    bge J, I, .L13 // remainder <= 4: scalar tail
    xvld VX0, X, 0
    // remainder in (4,8): overlap-load the last 4 remainder elements
    sub.d I, I, J
    slli.d I, I, BASE_SHIFT
    xvldx VX1, X, I
    xvfmin.d m0, VX0, VX1 // partial repeat read: the two loads overlap
    xvpickve.d x1, m0, 0
    xvpickve.d x2, m0, 1
    xvpickve.d x3, m0, 2
    xvpickve.d x4, m0, 3
    xvfmin.d VM1, x1, x2
    xvfmin.d m0, x3, x4
    xvfmin.d m0, m0, VM1
    xvfmin.d VM0, m0, VM0
    fmov.d $f0, $f22
    jirl $r0, $r1, 0x0
    .align 3

.L13: // INCX==1 and remainder <= 4
    bge $r0, I, .L15
    .align 3

.L14:
    xvld x1, X, 0
    xvfmin.d VM0, VM0, x1
    addi.d I, I, -1
    addi.d X, X, SIZE
    blt $r0, I, .L14
    .align 3

.L15:
    fmov.d $f0, $f22
    jirl $r0, $r1, 0x0
    .align 3

.L20: // INCX!=1
    move TEMP, X // gather the first elements to seed the min
    ld.d t1, TEMP, 0 * SIZE
    add.d TEMP, TEMP, INCX
    xvinsgr2vr.d VM0, t1, 0
    srai.d I, N, 3
    bge $r0, I, .L23
    ld.d t2, TEMP, 0 * SIZE
    add.d TEMP, TEMP, INCX
    ld.d t3, TEMP, 0 * SIZE
    add.d TEMP, TEMP, INCX
    ld.d t4, TEMP, 0 * SIZE
    add.d TEMP, TEMP, INCX
    xvinsgr2vr.d VM0, t2, 1
    xvinsgr2vr.d VM0, t3, 2
    xvinsgr2vr.d VM0, t4, 3
    .align 3

.L21: // gather 8 strided doubles per iteration
    ld.d t1, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t2, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t3, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t4, X, 0 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.d VX0, t1, 0
    xvinsgr2vr.d VX0, t2, 1
    xvinsgr2vr.d VX0, t3, 2
    xvinsgr2vr.d VX0, t4, 3
    ld.d t1, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t2, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t3, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t4, X, 0 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.d VX1, t1, 0
    xvinsgr2vr.d VX1, t2, 1
    xvinsgr2vr.d VX1, t3, 2
    xvinsgr2vr.d VX1, t4, 3
    addi.d I, I, -1
    xvfmin.d VM1, VX1, VX0
    xvfmin.d VM0, VM1, VM0
    blt $r0, I, .L21
    .align 3

.L22: // fold the four lanes of VM0
    xvpickve.d x1, VM0, 0
    xvpickve.d x2, VM0, 1
    xvpickve.d x3, VM0, 2
    xvpickve.d x4, VM0, 3
    xvfmin.d VM1, x1, x2
    xvfmin.d VM2, x3, x4
    xvfmin.d VM0, VM1, VM2
    .align 3

.L23: // INCX!=1, remainder N&7
    andi I, N, 7
    bge $r0, I, .L999
    .align 3

.L24:
    xvld x1, X, 0
    xvfmin.d VM0, VM0, x1
    addi.d I, I, -1
    add.d X, X, INCX
    blt $r0, I, .L24
    .align 3

.L999:
    fmov.d $f0, $f22
    jirl $r0, $r1, 0x0
    .align 3

    EPILOGUE
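
The contiguous path above keeps a running four-lane minimum in VM0 (two 256-bit loads, eight doubles per .L10 iteration), folds the lanes with xvpickve.d plus xvfmin.d at .L11, and finishes with a scalar tail. A minimal C sketch of that shape, assuming n >= 1 and unit stride; the function name is illustrative, and the broadcast seed stands in for the assembly's xvld of x[0..3]:

    /* Shape of the LASX contiguous reduction; illustrative only. */
    double dmin_blocked4(long n, const double *x)
    {
        double lane[4] = { x[0], x[0], x[0], x[0] };   /* plays the role of VM0 */
        long i;
        for (i = 0; i + 8 <= n; i += 8)                /* .L10: 8 doubles per pass */
            for (int k = 0; k < 4; k++) {
                double t = x[i + k] < x[i + 4 + k] ? x[i + k] : x[i + 4 + k];
                if (t < lane[k])                       /* xvfmin.d VM0, VM0, VM1 */
                    lane[k] = t;
            }
        double m01 = lane[0] < lane[1] ? lane[0] : lane[1];  /* .L11: lane fold */
        double m23 = lane[2] < lane[3] ? lane[2] : lane[3];
        double m = m01 < m23 ? m01 : m23;
        for (; i < n; i++)                             /* scalar tail */
            if (x[i] < m)
                m = x[i];
        return m;
    }

For a remainder of five to seven elements the assembly replaces the scalar tail with the "partial repeat read" at .L12: it loads the first four and the last four remainder elements, so the two loads overlap. Comparing an element twice is harmless in a min reduction, which is what makes the trick valid.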

kernel/loongarch64/dmin_lsx.S

Lines changed: 143 additions & 0 deletions
@@ -0,0 +1,143 @@
#define ASSEMBLER

#include "common.h"

#define N     $r4
#define X     $r5
#define INCX  $r6
#define I     $r12
#define t1    $r14
#define t2    $r18
#define t3    $r15
#define t4    $r17
#define TEMP  $r16
#define x1    $vr9
#define x2    $vr10
#define VX0   $vr20
#define VX1   $vr21
#define VM0   $vr22
#define VM1   $vr23
#define VM2   $vr18
#define VM3   $vr19

    PROLOGUE

    bge $r0, N, .L999
    bge $r0, INCX, .L999
    li.d TEMP, 1
    slli.d TEMP, TEMP, BASE_SHIFT
    slli.d INCX, INCX, BASE_SHIFT
    bne INCX, TEMP, .L20
    vld VM0, X, 0
    srai.d I, N, 3
    bge $r0, I, .L12
    .align 3

.L10: // INCX==1, 8 doubles per iteration in four 128-bit loads
    vld VX0, X, 0 * SIZE
    vld VX1, X, 2 * SIZE
    addi.d I, I, -1
    vfmin.d VM1, VX1, VX0
    vld VX0, X, 4 * SIZE
    vld VX1, X, 6 * SIZE
    vfmin.d VM2, VX1, VX0
    vfmin.d VM3, VM1, VM2
    addi.d X, X, 8 * SIZE
    vfmin.d VM0, VM0, VM3
    blt $r0, I, .L10
    .align 3

.L11: // fold the two lanes of VM0
    vreplvei.d x1, VM0, 0
    vreplvei.d x2, VM0, 1
    vfmin.d VM0, x1, x2
    .align 3

.L12: // INCX==1, remainder N&7
    andi I, N, 7
    bge $r0, I, .L14
    .align 3

.L13:
    vld x1, X, 0
    addi.d I, I, -1
    vfmin.d VM0, VM0, x1
    addi.d X, X, SIZE
    blt $r0, I, .L13
    .align 3

.L14:
    fmov.d $f0, $f22
    jirl $r0, $r1, 0x0
    .align 3

.L20: // INCX!=1
    move TEMP, X // gather the first elements to seed the min
    ld.d t1, TEMP, 0 * SIZE
    add.d TEMP, TEMP, INCX
    vinsgr2vr.d VM0, t1, 0
    srai.d I, N, 3
    bge $r0, I, .L23
    ld.d t2, TEMP, 0 * SIZE
    add.d TEMP, TEMP, INCX
    vinsgr2vr.d VM0, t2, 1
    .align 3

.L21: // gather 8 strided doubles per iteration
    ld.d t1, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t2, X, 0 * SIZE
    add.d X, X, INCX
    vinsgr2vr.d VX0, t1, 0
    vinsgr2vr.d VX0, t2, 1
    ld.d t3, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t4, X, 0 * SIZE
    add.d X, X, INCX
    vinsgr2vr.d VX1, t3, 0
    vinsgr2vr.d VX1, t4, 1
    vfmin.d VM1, VX0, VX1
    ld.d t1, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t2, X, 0 * SIZE
    add.d X, X, INCX
    vinsgr2vr.d VX0, t1, 0
    vinsgr2vr.d VX0, t2, 1
    ld.d t3, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t4, X, 0 * SIZE
    add.d X, X, INCX
    vinsgr2vr.d VX1, t3, 0
    vinsgr2vr.d VX1, t4, 1
    addi.d I, I, -1
    vfmin.d VM2, VX0, VX1
    vfmin.d VM3, VM1, VM2
    vfmin.d VM0, VM0, VM3
    blt $r0, I, .L21
    .align 3

.L22: // fold the two lanes of VM0
    vreplvei.d x1, VM0, 0
    vreplvei.d x2, VM0, 1
    vfmin.d VM0, x1, x2
    .align 3

.L23: // INCX!=1, remainder N&7
    andi I, N, 7
    bge $r0, I, .L999
    .align 3

.L24:
    vld x1, X, 0
    addi.d I, I, -1
    vfmin.d VM0, VM0, x1
    add.d X, X, INCX
    blt $r0, I, .L24
    .align 3

.L999:
    fmov.d $f0, $f22
    jirl $r0, $r1, 0x0
    .align 3

    EPILOGUE
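
With INCX != 1 contiguous vector loads no longer apply, so both kernels gather one element at a time with ld.d and insert it into a vector lane with vinsgr2vr.d/xvinsgr2vr.d, still retiring eight elements per .L21 pass. A scalar C sketch of that strided shape at the LSX width of two lanes; the function name is illustrative and n >= 1 is assumed:

    /* Shape of the LSX strided path (.L20/.L21); illustrative only. */
    double dmin_strided2(long n, const double *x, long incx)
    {
        double lane[2] = { x[0], x[0] };  /* seeded like VM0 by the ld.d prologue */
        long i, p = 0;                    /* p mirrors X advancing by INCX */
        for (i = 0; i + 8 <= n; i += 8)   /* .L21: four two-lane gathers per pass */
            for (int k = 0; k < 8; k++, p += incx)
                if (x[p] < lane[k & 1])   /* vfmin.d on the gathered pairs */
                    lane[k & 1] = x[p];
        double m = lane[0] < lane[1] ? lane[0] : lane[1]; /* vreplvei.d + vfmin.d fold */
        for (; i < n; i++, p += incx)     /* .L24 tail */
            if (x[p] < m)
                m = x[p];
        return m;
    }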
