
Commit c80e7e2

yanchengyinshiyou authored and committed
loongarch64: Add optimizations for sum and asum.
1 parent d4c96a3 commit c80e7e2

File tree

10 files changed, +1136 -0 lines changed


kernel/loongarch64/KERNEL.LOONGSON2K1000

Lines changed: 6 additions & 0 deletions
@@ -43,4 +43,10 @@ DAXPYKERNEL = daxpy_lsx.S
 SAXPBYKERNEL = saxpby_lsx.S
 DAXPBYKERNEL = daxpby_lsx.S

+SSUMKERNEL = ssum_lsx.S
+DSUMKERNEL = dsum_lsx.S
+
+SASUMKERNEL = sasum_lsx.S
+DASUMKERNEL = dasum_lsx.S
+
 endif
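
For context, the two kernel families registered here compute different reductions: the ?SUM kernels implement OpenBLAS's plain element-sum extension, while the ?ASUM kernels implement the BLAS sum of absolute values. A minimal scalar reference for the unit-stride, double-precision case (the function names below are illustrative only, not the actual kernel entry points):

#include <math.h>
#include <stddef.h>
#include <stdio.h>

/* Scalar reference for the two reductions (unit stride, double precision).
 * dsum_ref and dasum_ref are illustrative names only. */
static double dsum_ref(size_t n, const double *x) {
    double s = 0.0;
    for (size_t i = 0; i < n; i++)
        s += x[i];          /* sum: no absolute value */
    return s;
}

static double dasum_ref(size_t n, const double *x) {
    double s = 0.0;
    for (size_t i = 0; i < n; i++)
        s += fabs(x[i]);    /* asum: sum of absolute values */
    return s;
}

int main(void) {
    double v[4] = { 1.0, -2.0, 3.0, -4.0 };
    printf("sum=%f asum=%f\n", dsum_ref(4, v), dasum_ref(4, v)); /* -2.0 and 10.0 */
    return 0;
}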

kernel/loongarch64/KERNEL.LOONGSON3R5

Lines changed: 6 additions & 0 deletions
@@ -43,6 +43,12 @@ DAXPYKERNEL = daxpy_lasx.S
 SAXPBYKERNEL = saxpby_lasx.S
 DAXPBYKERNEL = daxpby_lasx.S

+SSUMKERNEL = ssum_lasx.S
+DSUMKERNEL = dsum_lasx.S
+
+SASUMKERNEL = sasum_lasx.S
+DASUMKERNEL = dasum_lasx.S
+
 DGEMMKERNEL = dgemm_kernel_16x4.S
 DGEMMINCOPY = dgemm_ncopy_16.S
 DGEMMITCOPY = dgemm_tcopy_16.S
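
The LOONGSON3R5 target registers the 256-bit LASX variants of the same four kernels. Both the LSX and the LASX implementations shown below split the vector length the same way: srai.d I, N, 3 gives the number of 8-element main-loop iterations and andi I, N, 7 the scalar tail. A small sketch of that split (illustrative only):

#include <stdio.h>

/* Illustrative only: the 8-element blocking used by the sum/asum kernels
 * (srai.d I, N, 3 for the main loop count, andi I, N, 7 for the tail). */
int main(void) {
    long n = 1000;
    long main_iters = n >> 3; /* 125 blocks of 8 doubles */
    long tail       = n & 7;  /* 0 leftover elements     */
    printf("blocks=%ld tail=%ld\n", main_iters, tail);
    return 0;
}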

kernel/loongarch64/dasum_lasx.S

Lines changed: 148 additions & 0 deletions
@@ -0,0 +1,148 @@
#define ASSEMBLER
#include "common.h"

#define N     $r4
#define X     $r5
#define INCX  $r6
#define I     $r17
#define TEMP  $r18
#define t1    $r15
#define t2    $r12
#define t3    $r13
#define t4    $r14
#define VX0   $xr12
#define VX1   $xr13
#define VX2   $xr14
#define VX3   $xr15
#define VT0   $xr23
#define VT1   $xr22
#define res1  $xr16
#define res2  $xr17
#define res0  $xr18
#define neg1  $xr19

    PROLOGUE
    xvxor.v res1, res1, res1
    xvxor.v res2, res2, res2
    xvxor.v res0, res0, res0
    bge     $r0, N, .L999               /* nothing to do for n <= 0 */
    bge     $r0, INCX, .L999            /* or for a non-positive stride */
    li.d    t1, -1
    xvreplgr2vr.d neg1, t1
    xvffint.d.l   neg1, neg1            /* neg1 = {-1.0, -1.0, -1.0, -1.0} */
    li.d    TEMP, SIZE
    slli.d  INCX, INCX, BASE_SHIFT      /* stride in bytes */
    srai.d  I, N, 3                     /* I = number of 8-element blocks */
    bne     INCX, TEMP, .L20            /* non-unit stride: gather path */
    bge     $r0, I, .L13
    .align 3

/* Unit-stride main loop: 8 doubles per iteration in two LASX registers.
   |x| is formed as bitsel(x, -x, x < 0) and accumulated per lane in res1. */
.L11:
    xvld    VX0, X, 0 * SIZE
    xvld    VX1, X, 4 * SIZE
    xvfmul.d     VX2, neg1, VX0
    xvfmul.d     VX3, neg1, VX1
    xvfcmp.clt.d VT0, VX0, res0
    xvfcmp.clt.d VT1, VX1, res0
    xvbitsel.v   VX0, VX0, VX2, VT0
    xvbitsel.v   VX1, VX1, VX3, VT1
    xvfadd.d     res2, VX0, VX1
    xvfadd.d     res1, res1, res2
    addi.d  X, X, 8 * SIZE
    addi.d  I, I, -1
    blt     $r0, I, .L11
    .align 3

/* Reduce the four lanes of res1 into lane 0 (the low lane aliases $f16). */
.L12:
    xvpickve.d  VX1, res1, 1
    xvpickve.d  VX2, res1, 2
    xvpickve.d  VX3, res1, 3
    xvfadd.d    res1, VX1, res1
    xvfadd.d    res1, VX2, res1
    xvfadd.d    res1, VX3, res1
    .align 3

.L13:
    andi    I, N, 7                     /* scalar tail: remaining N % 8 elements */
    bge     $r0, I, .L999
    .align 3

.L14:
    fld.d   $f12, X, 0 * SIZE
    fabs.d  $f12, $f12
    fadd.d  $f16, $f12, $f16
    addi.d  I, I, -1
    addi.d  X, X, SIZE
    blt     $r0, I, .L14
    b       .L999
    .align 3

.L20:
    bge     $r0, I, .L23
    .align 3

/* Strided main loop: gather 8 doubles with scalar loads, then the same
   compare/select/accumulate sequence as the unit-stride path. */
.L21:
    ld.d    t1, X, 0 * SIZE
    add.d   X, X, INCX
    ld.d    t2, X, 0 * SIZE
    add.d   X, X, INCX
    ld.d    t3, X, 0 * SIZE
    add.d   X, X, INCX
    ld.d    t4, X, 0 * SIZE
    add.d   X, X, INCX
    xvinsgr2vr.d VX0, t1, 0
    xvinsgr2vr.d VX0, t2, 1
    xvinsgr2vr.d VX0, t3, 2
    xvinsgr2vr.d VX0, t4, 3
    ld.d    t1, X, 0 * SIZE
    add.d   X, X, INCX
    ld.d    t2, X, 0 * SIZE
    add.d   X, X, INCX
    ld.d    t3, X, 0 * SIZE
    add.d   X, X, INCX
    ld.d    t4, X, 0 * SIZE
    add.d   X, X, INCX
    xvinsgr2vr.d VX1, t1, 0
    xvinsgr2vr.d VX1, t2, 1
    xvinsgr2vr.d VX1, t3, 2
    xvinsgr2vr.d VX1, t4, 3
    xvfmul.d     VX2, neg1, VX0
    xvfmul.d     VX3, neg1, VX1
    xvfcmp.clt.d VT0, VX0, res0
    xvfcmp.clt.d VT1, VX1, res0
    xvbitsel.v   VX0, VX0, VX2, VT0
    xvbitsel.v   VX1, VX1, VX3, VT1
    xvfadd.d     res2, VX0, VX1
    xvfadd.d     res1, res1, res2
    addi.d  I, I, -1
    blt     $r0, I, .L21
    .align 3

.L22:
    xvpickve.d  VX1, res1, 1
    xvpickve.d  VX2, res1, 2
    xvpickve.d  VX3, res1, 3
    xvfadd.d    res1, VX1, res1
    xvfadd.d    res1, VX2, res1
    xvfadd.d    res1, VX3, res1
    .align 3

.L23:
    andi    I, N, 7
    bge     $r0, I, .L999
    .align 3

.L24:
    fld.d   $f12, X, 0 * SIZE
    fabs.d  $f12, $f12
    fadd.d  $f16, $f12, $f16
    addi.d  I, I, -1
    add.d   X, X, INCX
    blt     $r0, I, .L24
    .align 3

/* res1's low lane is $f16; return the accumulated sum in $f0. */
.L999:
    fmov.d  $f0, $f16
    jirl    $r0, $r1, 0x0
    .align 3

    EPILOGUE
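
As a reading aid, the following is a rough C model of the control flow in dasum_lasx.S, assuming an (n, x, incx) argument order; it sketches the structure only and is not the kernel's real interface. Labels .L11-.L14 correspond to the unit-stride path, .L20-.L24 to the strided gather path.

#include <math.h>
#include <stdio.h>

/* Rough, illustrative C model of dasum_lasx.S; not the kernel's real interface. */
static double dasum_model(long n, const double *x, long incx) {
    double acc = 0.0;
    if (n <= 0 || incx <= 0)                /* early exits: bge $r0, N/INCX, .L999 */
        return acc;

    long blocks = n >> 3;                   /* srai.d I, N, 3 */
    for (long i = 0; i < blocks; i++) {     /* .L11 (unit stride) or .L21 (gather) */
        for (int k = 0; k < 8; k++)         /* 8 doubles per iteration             */
            acc += fabs(x[k * incx]);       /* |x| via compare + bitselect in asm  */
        x += 8 * incx;
    }
    for (long i = 0; i < (n & 7); i++)      /* .L14 / .L24: scalar tail (andi I, N, 7) */
        acc += fabs(x[i * incx]);
    return acc;                             /* .L999: result returned in $f0 */
}

int main(void) {
    double v[5] = { 1.0, -2.0, 3.0, -4.0, 5.0 };
    printf("%f\n", dasum_model(5, v, 1));   /* 15.000000 */
    return 0;
}

Note that the kernel keeps four partial sums, one per LASX lane, and reduces them only once after the main loop (.L12/.L22), so its floating-point summation order differs from this sequential model.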

kernel/loongarch64/dasum_lsx.S

Lines changed: 158 additions & 0 deletions
@@ -0,0 +1,158 @@
#define ASSEMBLER
#include "common.h"

#define N     $r4
#define X     $r5
#define INCX  $r6
#define I     $r17
#define TEMP  $r18
#define t1    $r15
#define t2    $r12
#define t3    $r13
#define t4    $r14
#define VX0   $vr12
#define VX1   $vr13
#define VX2   $vr14
#define VX3   $vr15
#define VT0   $vr23
#define VT1   $vr22
#define res1  $vr16
#define res2  $vr17
#define res0  $vr18
#define neg1  $vr19

    PROLOGUE
    vxor.v  res1, res1, res1
    vxor.v  res2, res2, res2
    vxor.v  res0, res0, res0
    bge     $r0, N, .L999               /* nothing to do for n <= 0 */
    bge     $r0, INCX, .L999            /* or for a non-positive stride */
    li.d    t1, -1
    vreplgr2vr.d neg1, t1
    vffint.d.l   neg1, neg1             /* neg1 = {-1.0, -1.0} */
    li.d    TEMP, SIZE
    slli.d  INCX, INCX, BASE_SHIFT      /* stride in bytes */
    srai.d  I, N, 3                     /* I = number of 8-element blocks */
    bne     INCX, TEMP, .L20            /* non-unit stride: gather path */
    bge     $r0, I, .L13
    .align 3

/* Unit-stride main loop: 8 doubles per iteration in four 128-bit LSX loads,
   using the same compare/select trick for |x| as the LASX version. */
.L11:
    vld     VX0, X, 0 * SIZE
    vld     VX1, X, 2 * SIZE
    vfmul.d     VX2, neg1, VX0
    vfmul.d     VX3, neg1, VX1
    vfcmp.clt.d VT0, VX0, res0
    vfcmp.clt.d VT1, VX1, res0
    vbitsel.v   VX0, VX0, VX2, VT0
    vbitsel.v   VX1, VX1, VX3, VT1
    vfadd.d     res2, VX0, VX1
    vfadd.d     res1, res1, res2
    vld     VX0, X, 4 * SIZE
    vld     VX1, X, 6 * SIZE
    vfmul.d     VX2, neg1, VX0
    vfmul.d     VX3, neg1, VX1
    vfcmp.clt.d VT0, VX0, res0
    vfcmp.clt.d VT1, VX1, res0
    vbitsel.v   VX0, VX0, VX2, VT0
    vbitsel.v   VX1, VX1, VX3, VT1
    vfadd.d     res2, VX0, VX1
    vfadd.d     res1, res1, res2
    addi.d  X, X, 8 * SIZE
    addi.d  I, I, -1
    blt     $r0, I, .L11
    .align 3

/* Fold the two lanes of res1 into lane 0 (the low lane aliases $f16). */
.L12:
    vreplvei.d  VX1, res1, 1
    vfadd.d     res1, VX1, res1
    .align 3

.L13:
    andi    I, N, 7                     /* scalar tail: remaining N % 8 elements */
    bge     $r0, I, .L999
    .align 3

.L14:
    fld.d   $f12, X, 0 * SIZE
    fabs.d  $f12, $f12
    fadd.d  $f16, $f12, $f16
    addi.d  I, I, -1
    addi.d  X, X, SIZE
    blt     $r0, I, .L14
    b       .L999
    .align 3

.L20:
    bge     $r0, I, .L23
    .align 3

/* Strided main loop: gather pairs of doubles with scalar loads, then the
   same compare/select/accumulate sequence, twice per 8-element block. */
.L21:
    ld.d    t1, X, 0 * SIZE
    add.d   X, X, INCX
    ld.d    t2, X, 0 * SIZE
    add.d   X, X, INCX
    vinsgr2vr.d VX0, t1, 0
    vinsgr2vr.d VX0, t2, 1
    ld.d    t1, X, 0 * SIZE
    add.d   X, X, INCX
    ld.d    t2, X, 0 * SIZE
    vinsgr2vr.d VX1, t1, 0
    vinsgr2vr.d VX1, t2, 1
    add.d   X, X, INCX
    vfmul.d     VX2, neg1, VX0
    vfmul.d     VX3, neg1, VX1
    vfcmp.clt.d VT0, VX0, res0
    vfcmp.clt.d VT1, VX1, res0
    vbitsel.v   VX0, VX0, VX2, VT0
    vbitsel.v   VX1, VX1, VX3, VT1
    vfadd.d     res2, VX0, VX1
    vfadd.d     res1, res1, res2
    ld.d    t3, X, 0 * SIZE
    add.d   X, X, INCX
    ld.d    t4, X, 0 * SIZE
    add.d   X, X, INCX
    vinsgr2vr.d VX0, t3, 0
    vinsgr2vr.d VX0, t4, 1
    ld.d    t3, X, 0 * SIZE
    add.d   X, X, INCX
    ld.d    t4, X, 0 * SIZE
    vinsgr2vr.d VX1, t3, 0
    vinsgr2vr.d VX1, t4, 1
    add.d   X, X, INCX
    vfmul.d     VX2, neg1, VX0
    vfmul.d     VX3, neg1, VX1
    vfcmp.clt.d VT0, VX0, res0
    vfcmp.clt.d VT1, VX1, res0
    vbitsel.v   VX0, VX0, VX2, VT0
    vbitsel.v   VX1, VX1, VX3, VT1
    vfadd.d     res2, VX0, VX1
    vfadd.d     res1, res1, res2
    addi.d  I, I, -1
    blt     $r0, I, .L21
    .align 3

.L22:
    vreplvei.d  VX1, res1, 1
    vfadd.d     res1, VX1, res1
    .align 3

.L23:
    andi    I, N, 7
    bge     $r0, I, .L999
    .align 3

.L24:
    fld.d   $f12, X, 0 * SIZE
    fabs.d  $f12, $f12
    fadd.d  $f16, $f12, $f16
    addi.d  I, I, -1
    add.d   X, X, INCX
    blt     $r0, I, .L24
    .align 3

/* res1's low lane is $f16; return the accumulated sum in $f0. */
.L999:
    fmov.d  $f0, $f16
    jirl    $r0, $r1, 0x0
    .align 3

    EPILOGUE
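
One implementation detail shared by both files: there is no vector absolute-value instruction in use. Instead |x| is built from three steps: a multiply by neg1 (-1.0 in every lane), an x < 0 compare (xvfcmp.clt.d / vfcmp.clt.d) that produces an all-ones mask per negative lane, and a bitwise select (xvbitsel.v / vbitsel.v) that picks -x wherever the mask is set. A scalar C sketch of the same select idiom (illustrative; the kernels apply it lane-wise to whole vectors):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Branch-free |x| via compare + bitwise select, mirroring the
 * fmul(neg1, x) / fcmp.clt / bitsel sequence used in the kernels. */
static double abs_by_select(double x) {
    double neg = -1.0 * x;                  /* candidate value: -x           */
    uint64_t mask = (x < 0.0) ? ~0ULL : 0;  /* all-ones where x is negative  */

    uint64_t a, b, r;
    memcpy(&a, &x, sizeof a);
    memcpy(&b, &neg, sizeof b);
    r = (a & ~mask) | (b & mask);           /* bitsel: keep x, or take -x    */

    double out;
    memcpy(&out, &r, sizeof out);
    return out;
}

int main(void) {
    printf("%f %f\n", abs_by_select(-3.5), abs_by_select(2.0)); /* 3.500000 2.000000 */
    return 0;
}

Beyond vector width, the two files differ mainly in the final reduction: the LSX version folds its two lanes with a single vreplvei.d plus vfadd.d, while the LASX version extracts lanes 1 through 3 with xvpickve.d and adds them into lane 0.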
