Skip to content

Commit efa9515

Browse files
authored
Merge branch 'OpenMathLib:develop' into win_perf
2 parents edac80d + 5b09833 commit efa9515

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

71 files changed

+19999
-1
lines changed

GotoBLAS_06WeirdPerformance.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
operation is finished.
1212

1313

14-
2. Simlar problem may happen under virtual machine. If supervisor
14+
2. Similar problem may happen under virtual machine. If supervisor
1515
allocates different cores for each scheduling, BLAS performance
1616
will be bad. This is because BLAS also utilizes all cache,
1717
unexpected re-schedule for different core may result in heavy

kernel/loongarch64/KERNEL.LOONGSON2K1000

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,55 @@ SDOTKERNEL = dot_lsx.S
44
DSDOTKERNEL = dot_lsx.S
55
DDOTKERNEL = dot_lsx.S
66

7+
SSCALKERNEL = sscal_lsx.S
8+
DSCALKERNEL = dscal_lsx.S
9+
10+
SAMAXKERNEL = samax_lsx.S
11+
DAMAXKERNEL = damax_lsx.S
12+
13+
SAMINKERNEL = samin_lsx.S
14+
DAMINKERNEL = damin_lsx.S
15+
16+
SMAXKERNEL = smax_lsx.S
17+
DMAXKERNEL = dmax_lsx.S
18+
19+
SMINKERNEL = smin_lsx.S
20+
DMINKERNEL = dmin_lsx.S
21+
22+
ISMAXKERNEL = ismax_lsx.S
23+
IDMAXKERNEL = idmax_lsx.S
24+
25+
ISMINKERNEL = ismin_lsx.S
26+
IDMINKERNEL = idmin_lsx.S
27+
28+
ISAMAXKERNEL = isamax_lsx.S
29+
IDAMAXKERNEL = idamax_lsx.S
30+
31+
ISAMINKERNEL = isamin_lsx.S
32+
IDAMINKERNEL = idamin_lsx.S
33+
34+
SCOPYKERNEL = scopy_lsx.S
35+
DCOPYKERNEL = dcopy_lsx.S
36+
37+
SSWAPKERNEL = sswap_lsx.S
38+
DSWAPKERNEL = dswap_lsx.S
39+
40+
SAXPYKERNEL = saxpy_lsx.S
41+
DAXPYKERNEL = daxpy_lsx.S
42+
43+
SAXPBYKERNEL = saxpby_lsx.S
44+
DAXPBYKERNEL = daxpby_lsx.S
45+
46+
SSUMKERNEL = ssum_lsx.S
47+
DSUMKERNEL = dsum_lsx.S
48+
49+
SASUMKERNEL = sasum_lsx.S
50+
DASUMKERNEL = dasum_lsx.S
51+
52+
SROTKERNEL = srot_lsx.S
53+
DROTKERNEL = drot_lsx.S
54+
55+
SNRM2KERNEL = snrm2_lsx.S
56+
DNRM2KERNEL = dnrm2_lsx.S
57+
758
endif

kernel/loongarch64/KERNEL.LOONGSON3R5

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,57 @@ SDOTKERNEL = dot_lasx.S
44
DSDOTKERNEL = dot_lasx.S
55
DDOTKERNEL = dot_lasx.S
66

7+
SSCALKERNEL = sscal_lasx.S
8+
DSCALKERNEL = dscal_lasx.S
9+
10+
SAMAXKERNEL = samax_lasx.S
11+
DAMAXKERNEL = damax_lasx.S
12+
13+
SAMINKERNEL = samin_lasx.S
14+
DAMINKERNEL = damin_lasx.S
15+
16+
SMAXKERNEL = smax_lasx.S
17+
DMAXKERNEL = dmax_lasx.S
18+
19+
SMINKERNEL = smin_lasx.S
20+
DMINKERNEL = dmin_lasx.S
21+
22+
ISMAXKERNEL = ismax_lasx.S
23+
IDMAXKERNEL = idmax_lasx.S
24+
25+
ISMINKERNEL = ismin_lasx.S
26+
IDMINKERNEL = idmin_lasx.S
27+
28+
ISAMAXKERNEL = isamax_lasx.S
29+
IDAMAXKERNEL = idamax_lasx.S
30+
31+
ISAMINKERNEL = isamin_lasx.S
32+
IDAMINKERNEL = idamin_lasx.S
33+
34+
SCOPYKERNEL = scopy_lasx.S
35+
DCOPYKERNEL = dcopy_lasx.S
36+
37+
SSWAPKERNEL = sswap_lasx.S
38+
DSWAPKERNEL = dswap_lasx.S
39+
40+
SAXPYKERNEL = saxpy_lasx.S
41+
DAXPYKERNEL = daxpy_lasx.S
42+
43+
SAXPBYKERNEL = saxpby_lasx.S
44+
DAXPBYKERNEL = daxpby_lasx.S
45+
46+
SSUMKERNEL = ssum_lasx.S
47+
DSUMKERNEL = dsum_lasx.S
48+
49+
SASUMKERNEL = sasum_lasx.S
50+
DASUMKERNEL = dasum_lasx.S
51+
52+
SROTKERNEL = srot_lasx.S
53+
DROTKERNEL = drot_lasx.S
54+
55+
SNRM2KERNEL = snrm2_lasx.S
56+
DNRM2KERNEL = dnrm2_lasx.S
57+
758
DGEMMKERNEL = dgemm_kernel_16x4.S
859
DGEMMINCOPY = dgemm_ncopy_16.S
960
DGEMMITCOPY = dgemm_tcopy_16.S

kernel/loongarch64/damax_lasx.S

Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,183 @@
1+
#define ASSEMBLER

#include "common.h"

/*
 * damax kernel, LoongArch64 LASX (256-bit vectors, 4 doubles each).
 *
 * Returns max(|x[i]|) for i in [0, N) in $f0.
 * Returns 0.0 when N <= 0 or INCX <= 0.
 *
 * In:   N    ($r4)  element count
 *       X    ($r5)  pointer to the first element
 *       INCX ($r6)  stride in elements (scaled to bytes on entry)
 * Out:  $f0         maximum absolute value
 *
 * VM0 is the running accumulator. xvfmaxa.d keeps the operand with the
 * larger magnitude, so the sign is stripped once with fabs.d at .L999.
 * Only lane 0 of VM0 is meaningful on the way to .L999.
 *
 * NOTE: VM0 is $xr22 and the return sequence addresses it as $f22.
 * Keep the two in sync if the register assignment changes.
 */

/* integer registers */
#define N      $r4
#define X      $r5
#define INCX   $r6
#define I      $r12
#define J      $r13
#define t1     $r14
#define t2     $r18
#define t3     $r15
#define t4     $r17
#define TEMP   $r16

/* vector registers */
#define m0     $xr8
#define x1     $xr9
#define x2     $xr10
#define x3     $xr11
#define x4     $xr12
#define x5     $xr13
#define x6     $xr14
#define x7     $xr15
#define x8     $xr16
#define VX0    $xr20
#define VX1    $xr21
#define VM0    $xr22
#define VM1    $xr23
#define VM2    $xr18
#define VM3    $xr19

    PROLOGUE

    xvxor.v     VM0, VM0, VM0           // result is 0.0 for the degenerate exits below
    bge         $r0, N, .L999           // N <= 0
    bge         $r0, INCX, .L999        // INCX <= 0
    li.d        TEMP, 1
    slli.d      TEMP, TEMP, BASE_SHIFT  // TEMP = byte size of one element
    slli.d      INCX, INCX, BASE_SHIFT  // INCX = stride in bytes
    bne         INCX, TEMP, .L20
    xvldrepl.d  VM0, X, 0               // seed every lane with x[0]; reads one element only
    srai.d      I, N, 3                 // I = number of full 8-element blocks
    bge         $r0, I, .L12
    .align 3

.L10:   // INCX == 1: 8 elements per iteration
    xvld        VX0, X, 0 * SIZE
    xvld        VX1, X, 4 * SIZE
    addi.d      I, I, -1
    xvfmaxa.d   VM1, VX1, VX0
    addi.d      X, X, 8 * SIZE
    xvfmaxa.d   VM0, VM0, VM1
    blt         $r0, I, .L10
    .align 3

.L11:   // horizontal reduction of VM0 into lane 0
    xvpickve.d  x1, VM0, 0
    xvpickve.d  x2, VM0, 1
    xvpickve.d  x3, VM0, 2
    xvpickve.d  x4, VM0, 3
    xvfmaxa.d   VM1, x1, x2
    xvfmaxa.d   VM2, x3, x4
    xvfmaxa.d   VM0, VM1, VM2
    .align 3

.L12:   // INCX == 1: handle the N % 8 leftover elements
    andi        I, N, 7                 // I = leftover count, 0..7
    li.d        J, 4
    bge         J, I, .L13              // leftover <= 4: scalar tail
    // leftover is 5..7: cover it with two overlapping 4-element loads
    xvld        VX0, X, 0               // x[0..3]
    sub.d       I, I, J                 // leftover - 4, in 1..3
    slli.d      I, I, BASE_SHIFT
    xvldx       VX1, X, I               // x[leftover-4 .. leftover-1]
    xvfmaxa.d   m0, VX0, VX1            // partially repeated read, harmless for a max
    xvpickve.d  x1, m0, 0
    xvpickve.d  x2, m0, 1
    xvpickve.d  x3, m0, 2
    xvpickve.d  x4, m0, 3
    xvfmaxa.d   VM1, x1, x2
    xvfmaxa.d   m0, x3, x4
    xvfmaxa.d   m0, m0, VM1
    xvfmaxa.d   VM0, m0, VM0
    b           .L999
    .align 3

.L13:   // INCX == 1 and leftover is 0..4
    bge         $r0, I, .L999
    .align 3

.L14:   // one element per iteration
    xvldrepl.d  x1, X, 0                // reads exactly one element
    addi.d      I, I, -1
    xvfmaxa.d   VM0, VM0, x1
    addi.d      X, X, SIZE
    blt         $r0, I, .L14
    b           .L999
    .align 3

.L20:   // INCX != 1
    move        TEMP, X                 // seed the accumulator without advancing X
    ld.d        t1, TEMP, 0 * SIZE
    add.d       TEMP, TEMP, INCX
    xvinsgr2vr.d VM0, t1, 0
    srai.d      I, N, 3                 // I = number of full 8-element blocks
    bge         $r0, I, .L23
    ld.d        t2, TEMP, 0 * SIZE
    add.d       TEMP, TEMP, INCX
    ld.d        t3, TEMP, 0 * SIZE
    add.d       TEMP, TEMP, INCX
    ld.d        t4, TEMP, 0 * SIZE
    add.d       TEMP, TEMP, INCX
    xvinsgr2vr.d VM0, t2, 1
    xvinsgr2vr.d VM0, t3, 2
    xvinsgr2vr.d VM0, t4, 3
    .align 3

.L21:   // gather 8 strided elements into VX0/VX1 per iteration
    ld.d        t1, X, 0 * SIZE
    add.d       X, X, INCX
    ld.d        t2, X, 0 * SIZE
    add.d       X, X, INCX
    ld.d        t3, X, 0 * SIZE
    add.d       X, X, INCX
    ld.d        t4, X, 0 * SIZE
    add.d       X, X, INCX
    xvinsgr2vr.d VX0, t1, 0
    xvinsgr2vr.d VX0, t2, 1
    xvinsgr2vr.d VX0, t3, 2
    xvinsgr2vr.d VX0, t4, 3
    ld.d        t1, X, 0 * SIZE
    add.d       X, X, INCX
    ld.d        t2, X, 0 * SIZE
    add.d       X, X, INCX
    ld.d        t3, X, 0 * SIZE
    add.d       X, X, INCX
    ld.d        t4, X, 0 * SIZE
    add.d       X, X, INCX
    xvinsgr2vr.d VX1, t1, 0
    xvinsgr2vr.d VX1, t2, 1
    xvinsgr2vr.d VX1, t3, 2
    xvinsgr2vr.d VX1, t4, 3
    addi.d      I, I, -1
    xvfmaxa.d   VM1, VX1, VX0
    xvfmaxa.d   VM0, VM1, VM0
    blt         $r0, I, .L21
    .align 3

.L22:   // horizontal reduction of VM0 into lane 0
    xvpickve.d  x1, VM0, 0
    xvpickve.d  x2, VM0, 1
    xvpickve.d  x3, VM0, 2
    xvpickve.d  x4, VM0, 3
    xvfmaxa.d   VM1, x1, x2
    xvfmaxa.d   VM2, x3, x4
    xvfmaxa.d   VM0, VM1, VM2
    .align 3

.L23:   // INCX != 1: handle the N % 8 leftover elements
    andi        I, N, 7
    bge         $r0, I, .L999
    .align 3

.L24:   // one strided element per iteration
    xvldrepl.d  x1, X, 0                // reads exactly one element
    addi.d      I, I, -1
    xvfmaxa.d   VM0, VM0, x1
    add.d       X, X, INCX
    blt         $r0, I, .L24
    .align 3

.L999:  // common exit: return |lane 0 of VM0|
    fabs.d      $f22, $f22
    fmov.d      $f0, $f22
    jirl        $r0, $r1, 0x0
    .align 3

    EPILOGUE

0 commit comments

Comments
 (0)