Skip to content

Commit 265b5f2

Browse files
yanchengyinshiyou
authored and committed
loongarch64: Add optimizations for amax.
1 parent 993ede7 commit 265b5f2

File tree

6 files changed

+719
-0
lines changed

6 files changed

+719
-0
lines changed

kernel/loongarch64/KERNEL.LOONGSON2K1000

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,4 +7,7 @@ DDOTKERNEL = dot_lsx.S
77
SSCALKERNEL = sscal_lsx.S
88
DSCALKERNEL = dscal_lsx.S
99

10+
SAMAXKERNEL = samax_lsx.S
11+
DAMAXKERNEL = damax_lsx.S
12+
1013
endif

kernel/loongarch64/KERNEL.LOONGSON3R5

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@ DDOTKERNEL = dot_lasx.S
77
SSCALKERNEL = sscal_lasx.S
88
DSCALKERNEL = dscal_lasx.S
99

10+
SAMAXKERNEL = samax_lasx.S
11+
DAMAXKERNEL = damax_lasx.S
12+
1013
DGEMMKERNEL = dgemm_kernel_16x4.S
1114
DGEMMINCOPY = dgemm_ncopy_16.S
1215
DGEMMITCOPY = dgemm_tcopy_16.S

kernel/loongarch64/damax_lasx.S

Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,183 @@
1+
#define ASSEMBLER

#include "common.h"

/*
 * damax kernel, LoongArch64 LASX (256-bit vector) version.
 *
 * Returns max(|x[i]|) over N double-precision elements read with stride INCX.
 *
 * In:   N    ($r4)  element count
 *       X    ($r5)  address of the first element
 *       INCX ($r6)  stride in elements (scaled to bytes below)
 * Out:  $f0 = largest absolute value, or 0.0 when N <= 0 or INCX <= 0
 * Uses: $r12-$r18, $xr8-$xr12, $xr18, $xr20-$xr23
 *
 * $f9 and $f22 name the low 64-bit lane of x1 ($xr9) and VM0 ($xr22); the
 * scalar tail loops and the final fabs rely on that aliasing.  Only lane 0 of
 * VM0 is meaningful once a reduction or a scalar update has run.
 *
 * BASE_SHIFT, SIZE, PROLOGUE and EPILOGUE are not defined in this file
 * (presumably they come from common.h -- confirm for non-DOUBLE builds).
 */

#define N      $r4
#define X      $r5
#define INCX   $r6
#define I      $r12
#define J      $r13
#define t1     $r14
#define t2     $r18
#define t3     $r15
#define t4     $r17
#define TEMP   $r16
#define m0     $xr8
#define x1     $xr9
#define x2     $xr10
#define x3     $xr11
#define x4     $xr12
#define VX0    $xr20
#define VX1    $xr21
#define VM0    $xr22
#define VM1    $xr23
#define VM2    $xr18

    PROLOGUE

    xvxor.v     VM0, VM0, VM0           // early exits must return 0.0, not stale register contents
    bge         $r0, N, .L999           // N <= 0
    bge         $r0, INCX, .L999        // INCX <= 0
    li.d        TEMP, 1
    slli.d      TEMP, TEMP, BASE_SHIFT  // TEMP = byte size of one element
    slli.d      INCX, INCX, BASE_SHIFT  // INCX = stride in bytes
    bne         INCX, TEMP, .L20        // non-unit stride
    xvldrepl.d  VM0, X, 0               // seed every lane with X[0]; touches one element only
    srai.d      I, N, 3                 // I = number of full 8-element groups
    bge         $r0, I, .L12
    .align 3

.L10:   // unit stride: 8 elements per iteration
    xvld        VX0, X, 0 * SIZE
    xvld        VX1, X, 4 * SIZE
    addi.d      I, I, -1
    xvfmaxa.d   VM1, VX1, VX0
    addi.d      X, X, 8 * SIZE
    xvfmaxa.d   VM0, VM0, VM1
    blt         $r0, I, .L10
    .align 3

.L11:   // fold the four lanes of VM0 into lane 0
    xvpickve.d  x1, VM0, 0
    xvpickve.d  x2, VM0, 1
    xvpickve.d  x3, VM0, 2
    xvpickve.d  x4, VM0, 3
    xvfmaxa.d   VM1, x1, x2
    xvfmaxa.d   VM2, x3, x4
    xvfmaxa.d   VM0, VM1, VM2
    .align 3

.L12:   // unit stride: handle the remaining N & 7 elements
    andi        I, N, 7
    li.d        J, 4
    bge         J, I, .L13              // 0..4 left: scalar loop
    // 5..7 left: cover them with two overlapping 4-lane loads
    xvld        VX0, X, 0               // X[0 .. 3]
    addi.d      I, I, -4                // rem - 4 = 1..3
    slli.d      I, I, BASE_SHIFT
    xvldx       VX1, X, I               // X[rem-4 .. rem-1], stays inside the array
    xvfmaxa.d   m0, VX0, VX1            // partially repeated read is harmless for max
    xvpickve.d  x1, m0, 0
    xvpickve.d  x2, m0, 1
    xvpickve.d  x3, m0, 2
    xvpickve.d  x4, m0, 3
    xvfmaxa.d   VM1, x1, x2
    xvfmaxa.d   m0, x3, x4
    xvfmaxa.d   m0, m0, VM1
    xvfmaxa.d   VM0, m0, VM0
    b           .L999
    .align 3

.L13:   // unit stride: 0..4 elements left
    bge         $r0, I, .L999
    .align 3

.L14:   // one element per iteration, scalar load so nothing past the end is read
    fld.d       $f9, X, 0
    addi.d      I, I, -1
    fmaxa.d     $f22, $f22, $f9
    addi.d      X, X, SIZE
    blt         $r0, I, .L14
    b           .L999
    .align 3

.L20:   // INCX != 1
    move        TEMP, X                 // seed the running max from the first elements
    ld.d        t1, TEMP, 0 * SIZE
    add.d       TEMP, TEMP, INCX
    xvinsgr2vr.d VM0, t1, 0
    srai.d      I, N, 3                 // I = number of full 8-element groups
    bge         $r0, I, .L23
    ld.d        t2, TEMP, 0 * SIZE
    add.d       TEMP, TEMP, INCX
    ld.d        t3, TEMP, 0 * SIZE
    add.d       TEMP, TEMP, INCX
    ld.d        t4, TEMP, 0 * SIZE
    add.d       TEMP, TEMP, INCX
    xvinsgr2vr.d VM0, t2, 1
    xvinsgr2vr.d VM0, t3, 2
    xvinsgr2vr.d VM0, t4, 3
    .align 3

.L21:   // strided: gather 8 elements through GPRs per iteration
    ld.d        t1, X, 0 * SIZE
    add.d       X, X, INCX
    ld.d        t2, X, 0 * SIZE
    add.d       X, X, INCX
    ld.d        t3, X, 0 * SIZE
    add.d       X, X, INCX
    ld.d        t4, X, 0 * SIZE
    add.d       X, X, INCX
    xvinsgr2vr.d VX0, t1, 0
    xvinsgr2vr.d VX0, t2, 1
    xvinsgr2vr.d VX0, t3, 2
    xvinsgr2vr.d VX0, t4, 3
    ld.d        t1, X, 0 * SIZE
    add.d       X, X, INCX
    ld.d        t2, X, 0 * SIZE
    add.d       X, X, INCX
    ld.d        t3, X, 0 * SIZE
    add.d       X, X, INCX
    ld.d        t4, X, 0 * SIZE
    add.d       X, X, INCX
    xvinsgr2vr.d VX1, t1, 0
    xvinsgr2vr.d VX1, t2, 1
    xvinsgr2vr.d VX1, t3, 2
    xvinsgr2vr.d VX1, t4, 3
    addi.d      I, I, -1
    xvfmaxa.d   VM1, VX1, VX0
    xvfmaxa.d   VM0, VM1, VM0
    blt         $r0, I, .L21
    .align 3

.L22:   // fold the four lanes of VM0 into lane 0
    xvpickve.d  x1, VM0, 0
    xvpickve.d  x2, VM0, 1
    xvpickve.d  x3, VM0, 2
    xvpickve.d  x4, VM0, 3
    xvfmaxa.d   VM1, x1, x2
    xvfmaxa.d   VM2, x3, x4
    xvfmaxa.d   VM0, VM1, VM2
    .align 3

.L23:   // strided: handle the remaining N & 7 elements
    andi        I, N, 7
    bge         $r0, I, .L999
    .align 3

.L24:   // one element per iteration, scalar load so nothing past the end is read
    fld.d       $f9, X, 0
    addi.d      I, I, -1
    fmaxa.d     $f22, $f22, $f9
    add.d       X, X, INCX
    blt         $r0, I, .L24
    .align 3

.L999:  // fmaxa keeps the sign of the winning element, so take |.| on the way out
    fabs.d      $f0, $f22
    jirl        $r0, $r1, 0x0
    .align 3

    EPILOGUE

kernel/loongarch64/damax_lsx.S

Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
1+
#define ASSEMBLER

#include "common.h"

/*
 * damax kernel, LoongArch64 LSX (128-bit vector) version.
 *
 * Returns max(|x[i]|) over N double-precision elements read with stride INCX.
 *
 * In:   N    ($r4)  element count
 *       X    ($r5)  address of the first element
 *       INCX ($r6)  stride in elements (scaled to bytes below)
 * Out:  $f0 = largest absolute value, or 0.0 when N <= 0 or INCX <= 0
 * Uses: $r12, $r14-$r18, $vr9, $vr10, $vr18-$vr23
 *
 * $f9 and $f22 name the low 64-bit lane of x1 ($vr9) and VM0 ($vr22); the
 * scalar tail loops and the final fabs rely on that aliasing.  Only lane 0 of
 * VM0 is meaningful once a reduction or a scalar update has run.
 *
 * BASE_SHIFT, SIZE, PROLOGUE and EPILOGUE are not defined in this file
 * (presumably they come from common.h -- confirm for non-DOUBLE builds).
 */

#define N      $r4
#define X      $r5
#define INCX   $r6
#define I      $r12
#define t1     $r14
#define t2     $r18
#define t3     $r15
#define t4     $r17
#define TEMP   $r16
#define x1     $vr9
#define x2     $vr10
#define VX0    $vr20
#define VX1    $vr21
#define VM0    $vr22
#define VM1    $vr23
#define VM2    $vr18
#define VM3    $vr19

    PROLOGUE

    vxor.v      VM0, VM0, VM0           // early exits must return 0.0, not stale register contents
    bge         $r0, N, .L999           // N <= 0
    bge         $r0, INCX, .L999        // INCX <= 0
    li.d        TEMP, 1
    slli.d      TEMP, TEMP, BASE_SHIFT  // TEMP = byte size of one element
    slli.d      INCX, INCX, BASE_SHIFT  // INCX = stride in bytes
    bne         INCX, TEMP, .L20        // non-unit stride
    vldrepl.d   VM0, X, 0               // seed both lanes with X[0]; touches one element only
    srai.d      I, N, 3                 // I = number of full 8-element groups
    bge         $r0, I, .L12
    .align 3

.L10:   // unit stride: 8 elements per iteration
    vld         VX0, X, 0 * SIZE
    vld         VX1, X, 2 * SIZE
    addi.d      I, I, -1
    vfmaxa.d    VM1, VX1, VX0
    vld         VX0, X, 4 * SIZE
    vld         VX1, X, 6 * SIZE
    vfmaxa.d    VM2, VX1, VX0
    vfmaxa.d    VM3, VM1, VM2
    addi.d      X, X, 8 * SIZE
    vfmaxa.d    VM0, VM0, VM3
    blt         $r0, I, .L10
    .align 3

.L11:   // fold the two lanes of VM0 together
    vreplvei.d  x1, VM0, 0
    vreplvei.d  x2, VM0, 1
    vfmaxa.d    VM0, x1, x2
    .align 3

.L12:   // unit stride: handle the remaining N & 7 elements
    andi        I, N, 7
    bge         $r0, I, .L999
    .align 3

.L13:   // one element per iteration, scalar load so nothing past the end is read
    fld.d       $f9, X, 0
    addi.d      I, I, -1
    fmaxa.d     $f22, $f22, $f9
    addi.d      X, X, SIZE
    blt         $r0, I, .L13
    b           .L999
    .align 3

.L20:   // INCX != 1
    move        TEMP, X                 // seed the running max from the first elements
    ld.d        t1, TEMP, 0 * SIZE
    add.d       TEMP, TEMP, INCX
    vinsgr2vr.d VM0, t1, 0
    srai.d      I, N, 3                 // I = number of full 8-element groups
    bge         $r0, I, .L23
    ld.d        t2, TEMP, 0 * SIZE
    add.d       TEMP, TEMP, INCX
    vinsgr2vr.d VM0, t2, 1
    .align 3

.L21:   // strided: gather 8 elements through GPRs per iteration
    ld.d        t1, X, 0 * SIZE
    add.d       X, X, INCX
    ld.d        t2, X, 0 * SIZE
    add.d       X, X, INCX
    vinsgr2vr.d VX0, t1, 0
    vinsgr2vr.d VX0, t2, 1
    ld.d        t3, X, 0 * SIZE
    add.d       X, X, INCX
    ld.d        t4, X, 0 * SIZE
    add.d       X, X, INCX
    vinsgr2vr.d VX1, t3, 0
    vinsgr2vr.d VX1, t4, 1
    vfmaxa.d    VM1, VX0, VX1
    ld.d        t1, X, 0 * SIZE
    add.d       X, X, INCX
    ld.d        t2, X, 0 * SIZE
    add.d       X, X, INCX
    vinsgr2vr.d VX0, t1, 0
    vinsgr2vr.d VX0, t2, 1
    ld.d        t3, X, 0 * SIZE
    add.d       X, X, INCX
    ld.d        t4, X, 0 * SIZE
    add.d       X, X, INCX
    vinsgr2vr.d VX1, t3, 0
    vinsgr2vr.d VX1, t4, 1
    vfmaxa.d    VM2, VX0, VX1
    vfmaxa.d    VM3, VM1, VM2
    vfmaxa.d    VM0, VM0, VM3
    addi.d      I, I, -1
    blt         $r0, I, .L21
    .align 3

.L22:   // fold the two lanes of VM0 together
    vreplvei.d  x1, VM0, 0
    vreplvei.d  x2, VM0, 1
    vfmaxa.d    VM0, x1, x2
    .align 3

.L23:   // strided: handle the remaining N & 7 elements
    andi        I, N, 7
    bge         $r0, I, .L999
    .align 3

.L24:   // one element per iteration, scalar load so nothing past the end is read
    fld.d       $f9, X, 0
    addi.d      I, I, -1
    fmaxa.d     $f22, $f22, $f9
    add.d       X, X, INCX
    blt         $r0, I, .L24
    .align 3

.L999:  // fmaxa keeps the sign of the winning element, so take |.| on the way out
    fabs.d      $f0, $f22
    jirl        $r0, $r1, 0x0
    .align 3

    EPILOGUE

0 commit comments

Comments
 (0)