
Commit ff2ecc6

yanchengyinshiyou authored and committed

loongarch64: Add optimization for amin.

1 parent 265b5f2 · commit ff2ecc6

6 files changed: +714 −0 lines changed
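
Both new kernels implement the AMIN reduction: the smallest absolute value of an n-element, possibly strided, double-precision vector. The xvfmina.d/vfmina.d instructions keep the operand with smaller magnitude, and a final fabs.d converts the kept value to the returned magnitude. For reference, a scalar C sketch of that semantics follows; the signature is modeled on OpenBLAS's generic C kernels and is an assumption, not part of this commit.

#include <math.h>
#include <stddef.h>

/* Scalar reference for DAMIN: smallest |x[i]| over n elements read with
 * stride inc_x. Hypothetical helper; signature assumed, not from the diff. */
static double damin_ref(size_t n, const double *x, ptrdiff_t inc_x)
{
    if (n < 1 || inc_x < 1)      /* the assembly branches straight to its epilogue here */
        return 0.0;              /* returning 0.0 is a conventional choice, assumed */
    double m = fabs(x[0]);
    for (size_t i = 1; i < n; i++)
        m = fmin(m, fabs(x[i * (size_t)inc_x]));
    return m;
}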

kernel/loongarch64/KERNEL.LOONGSON2K1000

Lines changed: 3 additions & 0 deletions
@@ -10,4 +10,7 @@ DSCALKERNEL = dscal_lsx.S
 SAMAXKERNEL = samax_lsx.S
 DAMAXKERNEL = damax_lsx.S

+SAMINKERNEL = samin_lsx.S
+DAMINKERNEL = damin_lsx.S
+
 endif

kernel/loongarch64/KERNEL.LOONGSON3R5

Lines changed: 3 additions & 0 deletions
@@ -10,6 +10,9 @@ DSCALKERNEL = dscal_lasx.S
 SAMAXKERNEL = samax_lasx.S
 DAMAXKERNEL = damax_lasx.S

+SAMINKERNEL = samin_lasx.S
+DAMINKERNEL = damin_lasx.S
+
 DGEMMKERNEL = dgemm_kernel_16x4.S
 DGEMMINCOPY = dgemm_ncopy_16.S
 DGEMMITCOPY = dgemm_tcopy_16.S

kernel/loongarch64/damin_lasx.S

Lines changed: 178 additions & 0 deletions
@@ -0,0 +1,178 @@
#define ASSEMBLER

#include "common.h"

#define N     $r4
#define X     $r5
#define INCX  $r6
#define I     $r12
#define J     $r13
#define t1    $r14
#define t2    $r18
#define t3    $r15
#define t4    $r17
#define TEMP  $r16
#define m0    $xr8
#define x1    $xr9
#define x2    $xr10
#define x3    $xr11
#define x4    $xr12
#define VX0   $xr20
#define VX1   $xr21
#define VM0   $xr22
#define VM1   $xr23
#define VM2   $xr19

PROLOGUE

    bge     $r0, N, .L999
    bge     $r0, INCX, .L999
    li.d    TEMP, 1
    slli.d  TEMP, TEMP, BASE_SHIFT
    slli.d  INCX, INCX, BASE_SHIFT
    bne     INCX, TEMP, .L20
    xvld    VM0, X, 0
    srai.d  I, N, 3
    bge     $r0, I, .L12
    .align 3

.L10:
    xvld       VX0, X, 0 * SIZE
    addi.d     I, I, -1
    xvld       VX1, X, 4 * SIZE
    xvfmina.d  VM1, VX1, VX0
    addi.d     X, X, 8 * SIZE
    xvfmina.d  VM0, VM0, VM1
    blt        $r0, I, .L10
    .align 3

.L11:
    xvpickve.d x1, VM0, 0
    xvpickve.d x2, VM0, 1
    xvpickve.d x3, VM0, 2
    xvpickve.d x4, VM0, 3
    xvfmina.d  VM1, x1, x2
    xvfmina.d  VM2, x3, x4
    xvfmina.d  VM0, VM1, VM2
    .align 3

.L12: // INCX==1 and N<8
    andi       I, N, 7
    li.d       J, 4
    bge        J, I, .L13 // 4<N<8
    xvld       VX0, X, 0
    slli.d     J, J, 1 // 8
    sub.d      I, J, I
    slli.d     I, I, BASE_SHIFT
    xvldx      VX1, X, I
    xvfmina.d  m0, VX0, VX1 // partial repeat read
    xvpickve.d x1, m0, 0
    xvpickve.d x2, m0, 1
    xvpickve.d x3, m0, 2
    xvpickve.d x4, m0, 3
    xvfmina.d  VM1, x1, x2
    xvfmina.d  m0, x3, x4
    xvfmina.d  m0, m0, VM1
    xvfmina.d  VM0, m0, VM0
    fabs.d     $f22, $f22
    fmov.d     $f0, $f22
    jirl       $r0, $r1, 0x0
    .align 3

.L13: // INCX==1 and 0<=N<=4
    bge        $r0, I, .L15
    .align 3

.L14:
    xvld       x1, X, 0
    addi.d     I, I, -1
    xvfmina.d  VM0, VM0, x1
    addi.d     X, X, SIZE
    blt        $r0, I, .L14
    .align 3

.L15:
    fabs.d     $f22, $f22
    fmov.d     $f0, $f22
    jirl       $r0, $r1, 0x0
    .align 3

.L20: // INCX!=1
    move         TEMP, X // initialize the mina value
    ld.d         t1, TEMP, 0 * SIZE
    add.d        TEMP, TEMP, INCX
    xvinsgr2vr.d VM0, t1, 0
    srai.d       I, N, 3
    bge          $r0, I, .L23
    ld.d         t2, TEMP, 0 * SIZE
    add.d        TEMP, TEMP, INCX
    ld.d         t3, TEMP, 0 * SIZE
    add.d        TEMP, TEMP, INCX
    ld.d         t4, TEMP, 0 * SIZE
    add.d        TEMP, TEMP, INCX
    xvinsgr2vr.d VM0, t2, 1
    xvinsgr2vr.d VM0, t3, 2
    xvinsgr2vr.d VM0, t4, 3
    .align 3

.L21:
    ld.d         t1, X, 0 * SIZE
    add.d        X, X, INCX
    ld.d         t2, X, 0 * SIZE
    add.d        X, X, INCX
    ld.d         t3, X, 0 * SIZE
    add.d        X, X, INCX
    ld.d         t4, X, 0 * SIZE
    add.d        X, X, INCX
    xvinsgr2vr.d VX0, t1, 0
    xvinsgr2vr.d VX0, t2, 1
    xvinsgr2vr.d VX0, t3, 2
    xvinsgr2vr.d VX0, t4, 3
    ld.d         t1, X, 0 * SIZE
    add.d        X, X, INCX
    ld.d         t2, X, 0 * SIZE
    add.d        X, X, INCX
    ld.d         t3, X, 0 * SIZE
    add.d        X, X, INCX
    ld.d         t4, X, 0 * SIZE
    add.d        X, X, INCX
    xvinsgr2vr.d VX1, t1, 0
    xvinsgr2vr.d VX1, t2, 1
    xvinsgr2vr.d VX1, t3, 2
    xvinsgr2vr.d VX1, t4, 3
    addi.d       I, I, -1
    xvfmina.d    VM1, VX1, VX0
    xvfmina.d    VM0, VM1, VM0
    blt          $r0, I, .L21
    .align 3

.L22:
    xvpickve.d x1, VM0, 0
    xvpickve.d x2, VM0, 1
    xvpickve.d x3, VM0, 2
    xvpickve.d x4, VM0, 3
    xvfmina.d  VM1, x1, x2
    xvfmina.d  VM2, x3, x4
    xvfmina.d  VM0, VM1, VM2
    .align 3

.L23: // INCX!=1 and N<8
    andi       I, N, 7
    bge        $r0, I, .L999
    .align 3

.L24:
    xvld       x1, X, 0
    addi.d     I, I, -1
    xvfmina.d  VM0, VM0, x1
    add.d      X, X, INCX
    blt        $r0, I, .L24
    .align 3

.L999:
    fabs.d     $f22, $f22
    fmov.d     $f0, $f22
    jirl       $r0, $r1, 0x0
    .align 3

EPILOGUE
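
For the unit-stride path, the LASX kernel above keeps a running 4-lane accumulator (VM0), folds two 4-double loads per iteration with xvfmina.d (.L10), reduces across the lanes only once at the end (.L11), and finishes the remainder element by element (.L13/.L14). The C sketch below mirrors that blocking with a plain 4-element array standing in for the 256-bit register; it illustrates the control flow rather than the generated code, and damin_lasx_shape is a hypothetical name.

#include <math.h>
#include <stddef.h>

/* Illustrative only: same blocking as damin_lasx.S for INCX == 1, n >= 1. */
static double damin_lasx_shape(size_t n, const double *x)
{
    /* running per-lane minimum magnitude, seeded from the first element
     * (the kernel seeds VM0 with the first four elements via xvld) */
    double vm0[4] = { fabs(x[0]), fabs(x[0]), fabs(x[0]), fabs(x[0]) };
    size_t i = 0;

    /* main loop: 8 doubles per iteration, two 4-wide halves folded together */
    for (; i + 8 <= n; i += 8) {
        for (int k = 0; k < 4; k++) {
            double a = fmin(fabs(x[i + k]), fabs(x[i + 4 + k])); /* xvfmina.d VM1, VX1, VX0 */
            vm0[k] = fmin(vm0[k], a);                            /* xvfmina.d VM0, VM0, VM1 */
        }
    }

    /* cross-lane reduction, as in .L11 (xvpickve.d + xvfmina.d) */
    double m = fmin(fmin(vm0[0], vm0[1]), fmin(vm0[2], vm0[3]));

    /* remainder: the kernel handles 5..7 leftovers with an overlapping vector
     * load (.L12) and 1..4 leftovers scalarly (.L14); a scalar tail suffices here */
    for (; i < n; i++)
        m = fmin(m, fabs(x[i]));
    return m;
}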

kernel/loongarch64/damin_lsx.S

Lines changed: 145 additions & 0 deletions
@@ -0,0 +1,145 @@
#define ASSEMBLER

#include "common.h"

#define N     $r4
#define X     $r5
#define INCX  $r6
#define I     $r12
#define t1    $r14
#define t2    $r18
#define t3    $r15
#define t4    $r17
#define TEMP  $r16
#define x1    $vr9
#define x2    $vr10
#define VX0   $vr20
#define VX1   $vr21
#define VM0   $vr22
#define VM1   $vr23
#define VM2   $vr18
#define VM3   $vr19

PROLOGUE

    bge     $r0, N, .L999
    bge     $r0, INCX, .L999
    li.d    TEMP, 1
    slli.d  TEMP, TEMP, BASE_SHIFT
    slli.d  INCX, INCX, BASE_SHIFT
    bne     INCX, TEMP, .L20
    vld     VM0, X, 0
    srai.d  I, N, 3
    bge     $r0, I, .L12
    .align 3

.L10:
    vld       VX0, X, 0 * SIZE
    vld       VX1, X, 2 * SIZE
    addi.d    I, I, -1
    vfmina.d  VM1, VX1, VX0
    vld       VX0, X, 4 * SIZE
    vld       VX1, X, 6 * SIZE
    vfmina.d  VM2, VX1, VX0
    vfmina.d  VM3, VM1, VM2
    addi.d    X, X, 8 * SIZE
    vfmina.d  VM0, VM0, VM3
    blt       $r0, I, .L10
    .align 3

.L11:
    vreplvei.d x1, VM0, 0
    vreplvei.d x2, VM0, 1
    vfmina.d   VM0, x1, x2
    .align 3

.L12: // INCX==1 and N<8
    andi      I, N, 7
    bge       $r0, I, .L14
    .align 3

.L13:
    vld       x1, X, 0
    addi.d    I, I, -1
    vfmina.d  VM0, VM0, x1
    addi.d    X, X, SIZE
    blt       $r0, I, .L13
    .align 3

.L14:
    fabs.d    $f22, $f22
    fmov.d    $f0, $f22
    jirl      $r0, $r1, 0x0
    .align 3

.L20: // INCX!=1
    move        TEMP, X // initialize the mina value
    ld.d        t1, TEMP, 0 * SIZE
    add.d       TEMP, TEMP, INCX
    vinsgr2vr.d VM0, t1, 0
    srai.d      I, N, 3
    bge         $r0, I, .L23
    ld.d        t2, TEMP, 0 * SIZE
    add.d       TEMP, TEMP, INCX
    vinsgr2vr.d VM0, t2, 1
    .align 3

.L21:
    ld.d        t1, X, 0 * SIZE
    add.d       X, X, INCX
    ld.d        t2, X, 0 * SIZE
    add.d       X, X, INCX
    vinsgr2vr.d VX0, t1, 0
    vinsgr2vr.d VX0, t2, 1
    ld.d        t3, X, 0 * SIZE
    add.d       X, X, INCX
    ld.d        t4, X, 0 * SIZE
    add.d       X, X, INCX
    vinsgr2vr.d VX1, t3, 0
    vinsgr2vr.d VX1, t4, 1
    vfmina.d    VM1, VX0, VX1
    ld.d        t1, X, 0 * SIZE
    add.d       X, X, INCX
    ld.d        t2, X, 0 * SIZE
    add.d       X, X, INCX
    vinsgr2vr.d VX0, t1, 0
    vinsgr2vr.d VX0, t2, 1
    ld.d        t3, X, 0 * SIZE
    add.d       X, X, INCX
    ld.d        t4, X, 0 * SIZE
    add.d       X, X, INCX
    vinsgr2vr.d VX1, t3, 0
    vinsgr2vr.d VX1, t4, 1
    addi.d      I, I, -1
    vfmina.d    VM2, VX0, VX1
    vfmina.d    VM3, VM1, VM2
    vfmina.d    VM0, VM0, VM3
    blt         $r0, I, .L21
    .align 3

.L22:
    vreplvei.d x1, VM0, 0
    vreplvei.d x2, VM0, 1
    vfmina.d   VM0, x1, x2
    .align 3

.L23: // INCX!=1 and N<8
    andi      I, N, 7
    bge       $r0, I, .L999
    .align 3

.L24:
    vld       x1, X, 0
    vfmina.d  VM0, VM0, x1
    addi.d    I, I, -1
    add.d     X, X, INCX
    blt       $r0, I, .L24
    .align 3

.L999:
    fabs.d    $f22, $f22
    fmov.d    $f0, $f22
    jirl      $r0, $r1, 0x0
    .align 3

EPILOGUE
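
When INCX != 1, both kernels build their vectors by gathering scalars into lanes (xvinsgr2vr.d / vinsgr2vr.d) and then run the same fmina reduction tree; only the loads change. A short C sketch of that gather-and-reduce pattern for the 2-lane LSX case follows; it processes 4 elements per iteration for brevity where the kernel unrolls to 8, and damin_lsx_strided_shape is a hypothetical name used only for illustration.

#include <math.h>
#include <stddef.h>

/* Illustrative strided path (INCX != 1), assuming n >= 1 and inc_x >= 1. */
static double damin_lsx_strided_shape(size_t n, const double *x, ptrdiff_t inc_x)
{
    double vm0[2] = { fabs(x[0]), fabs(x[0]) };   /* seed, as .L20 seeds VM0 */
    const double *p = x;
    size_t i = 0;

    for (; i + 4 <= n; i += 4) {                  /* kernel does 8 per trip; 4 shown */
        double vx0[2] = { fabs(p[0]),         fabs(p[inc_x]) };     /* vinsgr2vr.d VX0 */
        double vx1[2] = { fabs(p[2 * inc_x]), fabs(p[3 * inc_x]) }; /* vinsgr2vr.d VX1 */
        for (int k = 0; k < 2; k++)
            vm0[k] = fmin(vm0[k], fmin(vx0[k], vx1[k]));            /* vfmina.d chain */
        p += 4 * inc_x;
    }

    double m = fmin(vm0[0], vm0[1]);              /* .L22: vreplvei.d + vfmina.d */
    for (; i < n; i++, p += inc_x)                /* scalar tail, as in .L24 */
        m = fmin(m, fabs(*p));
    return m;
}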
