Skip to content

Commit 519b40f

Browse files
authored
Merge pull request #4398 from yinshiyou/la-dev
Add Optimizations for LoongArch.
2 parents 21564bd + a5d0d21 commit 519b40f

File tree

136 files changed

+32598
-14902
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

136 files changed

+32598
-14902
lines changed

common_loongarch64.h

Lines changed: 56 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -119,19 +119,47 @@ static inline int WhereAmI(void){
119119
#define MOV fmov.d
120120
#define CMOVT fsel
121121
#define MTC movgr2fr.d
122+
#define MTG movfr2gr.d
122123
#define FABS fabs.d
124+
#define FMIN fmin.d
125+
#define FMINA fmina.d
126+
#define FMAX fmax.d
127+
#define FMAXA fmaxa.d
123128
#define CMPEQ fcmp.ceq.d
124129
#define CMPLE fcmp.cle.d
125130
#define CMPLT fcmp.clt.d
126131
#define NEG fneg.d
132+
#define FFINT ffint.d.l
127133

128134
#define XVFSUB xvfsub.d
129135
#define XVFADD xvfadd.d
136+
#define XVFMUL xvfmul.d
130137
#define XVFMADD xvfmadd.d
138+
#define XVFMIN xvfmin.d
139+
#define XVFMINA xvfmina.d
140+
#define XVFMAX xvfmax.d
141+
#define XVFMAXA xvfmaxa.d
142+
#define XVCMPEQ xvfcmp.ceq.d
143+
#define XVCMPLE xvfcmp.cle.d
144+
#define XVCMPLT xvfcmp.clt.d
145+
#define XVMUL xvfmul.d
146+
#define XVMSUB xvfmsub.d
147+
#define XVNMSUB xvfnmsub.d
131148

132149
#define VFSUB vfsub.d
133150
#define VFADD vfadd.d
151+
#define VFMUL vfmul.d
134152
#define VFMADD vfmadd.d
153+
#define VFMIN vfmin.d
154+
#define VFMINA vfmina.d
155+
#define VFMAX vfmax.d
156+
#define VFMAXA vfmaxa.d
157+
#define VCMPEQ vfcmp.ceq.d
158+
#define VCMPLE vfcmp.cle.d
159+
#define VCMPLT vfcmp.clt.d
160+
#define VMUL vfmul.d
161+
#define VMSUB vfmsub.d
162+
#define VNMSUB vfnmsub.d
135163

136164
#else
137165

@@ -147,19 +175,47 @@ static inline int WhereAmI(void){
147175
#define MOV fmov.s
148176
#define CMOVT fsel
149177
#define MTC movgr2fr.w
178+
#define MTG movfr2gr.s
150179
#define FABS fabs.s
180+
#define FMIN fmin.s
181+
#define FMINA fmina.s
182+
#define FMAX fmax.s
183+
#define FMAXA fmaxa.s
151184
#define CMPEQ fcmp.ceq.s
152185
#define CMPLE fcmp.cle.s
153186
#define CMPLT fcmp.clt.s
154187
#define NEG fneg.s
188+
#define FFINT ffint.s.l
155189

156190
#define XVFSUB xvfsub.s
157191
#define XVFADD xvfadd.s
192+
#define XVFMUL xvfmul.s
158193
#define XVFMADD xvfmadd.s
194+
#define XVFMIN xvfmin.s
195+
#define XVFMINA xvfmina.s
196+
#define XVFMAX xvfmax.s
197+
#define XVFMAXA xvfmaxa.s
198+
#define XVCMPEQ xvfcmp.ceq.s
199+
#define XVCMPLE xvfcmp.cle.s
200+
#define XVCMPLT xvfcmp.clt.s
201+
#define XVMUL xvfmul.s
202+
#define XVMSUB xvfmsub.s
203+
#define XVNMSUB xvfnmsub.s
159204

160205
#define VFSUB vfsub.s
161206
#define VFADD vfadd.s
207+
#define VFMUL vfmul.s
162208
#define VFMADD vfmadd.s
209+
#define VFMIN vfmin.s
210+
#define VFMINA vfmina.s
211+
#define VFMAX vfmax.s
212+
#define VFMAXA vfmaxa.s
213+
#define VCMPEQ vfcmp.ceq.s
214+
#define VCMPLE vfcmp.cle.s
215+
#define VCMPLT vfcmp.clt.s
216+
#define VMUL vfmul.s
217+
#define VMSUB vfmsub.s
218+
#define VNMSUB vfnmsub.s
163219

164220
#endif /* defined(DOUBLE) */
165221

kernel/loongarch64/KERNEL.LOONGSON2K1000

Lines changed: 84 additions & 32 deletions
Original file line number | Diff line number | Diff line change
@@ -3,56 +3,108 @@ ifndef NO_LSX
33
SDOTKERNEL = dot_lsx.S
44
DSDOTKERNEL = dot_lsx.S
55
DDOTKERNEL = dot_lsx.S
6+
CDOTKERNEL = cdot_lsx.S
7+
ZDOTKERNEL = cdot_lsx.S
68

7-
SSCALKERNEL = sscal_lsx.S
8-
DSCALKERNEL = dscal_lsx.S
9+
SSCALKERNEL = scal_lsx.S
10+
DSCALKERNEL = scal_lsx.S
11+
CSCALKERNEL = cscal_lsx.S
12+
ZSCALKERNEL = cscal_lsx.S
913

10-
SAMAXKERNEL = samax_lsx.S
11-
DAMAXKERNEL = damax_lsx.S
14+
SAMAXKERNEL = amax_lsx.S
15+
DAMAXKERNEL = amax_lsx.S
16+
CAMAXKERNEL = camax_lsx.S
1217

13-
SAMINKERNEL = samin_lsx.S
14-
DAMINKERNEL = damin_lsx.S
18+
SAMINKERNEL = amin_lsx.S
19+
DAMINKERNEL = amin_lsx.S
20+
CAMINKERNEL = camin_lsx.S
1521

16-
SMAXKERNEL = smax_lsx.S
17-
DMAXKERNEL = dmax_lsx.S
22+
SMAXKERNEL = max_lsx.S
23+
DMAXKERNEL = max_lsx.S
1824

19-
SMINKERNEL = smin_lsx.S
20-
DMINKERNEL = dmin_lsx.S
25+
SMINKERNEL = min_lsx.S
26+
DMINKERNEL = min_lsx.S
2127

22-
ISMAXKERNEL = ismax_lsx.S
23-
IDMAXKERNEL = idmax_lsx.S
28+
ISMAXKERNEL = imax_lsx.S
29+
IDMAXKERNEL = imax_lsx.S
2430

25-
ISMINKERNEL = ismin_lsx.S
26-
IDMINKERNEL = idmin_lsx.S
31+
ISMINKERNEL = imin_lsx.S
32+
IDMINKERNEL = imin_lsx.S
2733

28-
ISAMAXKERNEL = isamax_lsx.S
29-
IDAMAXKERNEL = idamax_lsx.S
34+
ISAMAXKERNEL = iamax_lsx.S
35+
IDAMAXKERNEL = iamax_lsx.S
36+
ICAMAXKERNEL = icamax_lsx.S
37+
IZAMAXKERNEL = icamax_lsx.S
3038

31-
ISAMINKERNEL = isamin_lsx.S
32-
IDAMINKERNEL = idamin_lsx.S
39+
ISAMINKERNEL = iamin_lsx.S
40+
IDAMINKERNEL = iamin_lsx.S
41+
ICAMINKERNEL = icamin_lsx.S
42+
IZAMINKERNEL = icamin_lsx.S
3343

34-
SCOPYKERNEL = scopy_lsx.S
35-
DCOPYKERNEL = dcopy_lsx.S
44+
SCOPYKERNEL = copy_lsx.S
45+
DCOPYKERNEL = copy_lsx.S
46+
CCOPYKERNEL = ccopy_lsx.S
47+
ZCOPYKERNEL = ccopy_lsx.S
3648

37-
SSWAPKERNEL = sswap_lsx.S
38-
DSWAPKERNEL = dswap_lsx.S
49+
SSWAPKERNEL = swap_lsx.S
50+
DSWAPKERNEL = swap_lsx.S
3951

40-
SAXPYKERNEL = saxpy_lsx.S
41-
DAXPYKERNEL = daxpy_lsx.S
52+
SAXPYKERNEL = axpy_lsx.S
53+
DAXPYKERNEL = axpy_lsx.S
54+
CAXPYKERNEL = caxpy_lsx.S
55+
ZAXPYKERNEL = caxpy_lsx.S
4256

43-
SAXPBYKERNEL = saxpby_lsx.S
44-
DAXPBYKERNEL = daxpby_lsx.S
57+
SAXPBYKERNEL = axpby_lsx.S
58+
DAXPBYKERNEL = axpby_lsx.S
4559

46-
SSUMKERNEL = ssum_lsx.S
47-
DSUMKERNEL = dsum_lsx.S
60+
SSUMKERNEL = sum_lsx.S
61+
DSUMKERNEL = sum_lsx.S
4862

49-
SASUMKERNEL = sasum_lsx.S
50-
DASUMKERNEL = dasum_lsx.S
63+
SASUMKERNEL = asum_lsx.S
64+
DASUMKERNEL = asum_lsx.S
65+
CASUMKERNEL = casum_lsx.S
66+
ZASUMKERNEL = casum_lsx.S
5167

52-
SROTKERNEL = srot_lsx.S
53-
DROTKERNEL = drot_lsx.S
68+
SROTKERNEL = rot_lsx.S
69+
DROTKERNEL = rot_lsx.S
70+
CROTKERNEL = crot_lsx.S
71+
ZROTKERNEL = crot_lsx.S
5472

5573
SNRM2KERNEL = snrm2_lsx.S
5674
DNRM2KERNEL = dnrm2_lsx.S
75+
CNRM2KERNEL = cnrm2_lsx.S
76+
ZNRM2KERNEL = znrm2_lsx.S
77+
78+
CSWAPKERNEL = cswap_lsx.S
79+
ZSWAPKERNEL = cswap_lsx.S
80+
81+
CSUMKERNEL = csum_lsx.S
82+
ZSUMKERNEL = csum_lsx.S
83+
84+
DGEMMKERNEL = dgemm_kernel_8x4.S
85+
DGEMMINCOPY = dgemm_ncopy_8_lsx.S
86+
DGEMMITCOPY = dgemm_tcopy_8_lsx.S
87+
DGEMMONCOPY = dgemm_ncopy_4_lsx.S
88+
DGEMMOTCOPY = dgemm_tcopy_4_lsx.S
89+
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
90+
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
91+
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
92+
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
93+
94+
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
95+
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
96+
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
97+
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
98+
99+
CGEMMKERNEL = cgemm_kernel_2x2_lsx.S
100+
CGEMMONCOPY = cgemm_ncopy_2_lsx.S
101+
CGEMMOTCOPY = cgemm_tcopy_2_lsx.S
102+
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
103+
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
104+
105+
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
106+
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
107+
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
108+
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
57109

58110
endif

kernel/loongarch64/KERNEL.LOONGSON3R5

Lines changed: 81 additions & 33 deletions
Original file line number | Diff line number | Diff line change
@@ -3,57 +3,83 @@ ifndef NO_LASX
33
SDOTKERNEL = dot_lasx.S
44
DSDOTKERNEL = dot_lasx.S
55
DDOTKERNEL = dot_lasx.S
6+
CDOTKERNEL = cdot_lasx.S
7+
ZDOTKERNEL = cdot_lasx.S
68

7-
SSCALKERNEL = sscal_lasx.S
8-
DSCALKERNEL = dscal_lasx.S
9+
SSCALKERNEL = scal_lasx.S
10+
DSCALKERNEL = scal_lasx.S
11+
CSCALKERNEL = cscal_lasx.S
12+
ZSCALKERNEL = cscal_lasx.S
913

10-
SAMAXKERNEL = samax_lasx.S
11-
DAMAXKERNEL = damax_lasx.S
14+
SAMAXKERNEL = amax_lasx.S
15+
DAMAXKERNEL = amax_lasx.S
16+
CAMAXKERNEL = camax_lasx.S
1217

13-
SAMINKERNEL = samin_lasx.S
14-
DAMINKERNEL = damin_lasx.S
18+
SAMINKERNEL = amin_lasx.S
19+
DAMINKERNEL = amin_lasx.S
20+
CAMINKERNEL = camin_lasx.S
1521

16-
SMAXKERNEL = smax_lasx.S
17-
DMAXKERNEL = dmax_lasx.S
22+
SMAXKERNEL = max_lsx.S
23+
DMAXKERNEL = max_lsx.S
1824

19-
SMINKERNEL = smin_lasx.S
20-
DMINKERNEL = dmin_lasx.S
25+
SMINKERNEL = min_lsx.S
26+
DMINKERNEL = min_lsx.S
2127

22-
ISMAXKERNEL = ismax_lasx.S
23-
IDMAXKERNEL = idmax_lasx.S
28+
ISMAXKERNEL = imax_lasx.S
29+
IDMAXKERNEL = imax_lasx.S
2430

25-
ISMINKERNEL = ismin_lasx.S
26-
IDMINKERNEL = idmin_lasx.S
31+
ISMINKERNEL = imin_lasx.S
32+
IDMINKERNEL = imin_lasx.S
2733

28-
ISAMAXKERNEL = isamax_lasx.S
29-
IDAMAXKERNEL = idamax_lasx.S
34+
ISAMAXKERNEL = iamax_lasx.S
35+
IDAMAXKERNEL = iamax_lasx.S
36+
ICAMAXKERNEL = icamax_lasx.S
37+
IZAMAXKERNEL = icamax_lasx.S
3038

31-
ISAMINKERNEL = isamin_lasx.S
32-
IDAMINKERNEL = idamin_lasx.S
39+
ISAMINKERNEL = iamin_lasx.S
40+
IDAMINKERNEL = iamin_lasx.S
41+
ICAMINKERNEL = icamin_lasx.S
42+
IZAMINKERNEL = icamin_lasx.S
3343

34-
SCOPYKERNEL = scopy_lasx.S
35-
DCOPYKERNEL = dcopy_lasx.S
44+
SCOPYKERNEL = copy_lasx.S
45+
DCOPYKERNEL = copy_lasx.S
46+
CCOPYKERNEL = ccopy_lasx.S
47+
ZCOPYKERNEL = ccopy_lasx.S
3648

37-
SSWAPKERNEL = sswap_lasx.S
38-
DSWAPKERNEL = dswap_lasx.S
49+
SSWAPKERNEL = swap_lasx.S
50+
DSWAPKERNEL = swap_lasx.S
3951

40-
SAXPYKERNEL = saxpy_lasx.S
41-
DAXPYKERNEL = daxpy_lasx.S
52+
SAXPYKERNEL = axpy_lasx.S
53+
DAXPYKERNEL = axpy_lasx.S
54+
CAXPYKERNEL = caxpy_lasx.S
55+
ZAXPYKERNEL = caxpy_lasx.S
4256

43-
SAXPBYKERNEL = saxpby_lasx.S
44-
DAXPBYKERNEL = daxpby_lasx.S
57+
SAXPBYKERNEL = axpby_lasx.S
58+
DAXPBYKERNEL = axpby_lasx.S
4559

46-
SSUMKERNEL = ssum_lasx.S
47-
DSUMKERNEL = dsum_lasx.S
60+
SSUMKERNEL = sum_lasx.S
61+
DSUMKERNEL = sum_lasx.S
4862

49-
SASUMKERNEL = sasum_lasx.S
50-
DASUMKERNEL = dasum_lasx.S
63+
SASUMKERNEL = asum_lasx.S
64+
DASUMKERNEL = asum_lasx.S
65+
CASUMKERNEL = casum_lasx.S
66+
ZASUMKERNEL = casum_lasx.S
5167

52-
SROTKERNEL = srot_lasx.S
53-
DROTKERNEL = drot_lasx.S
68+
SROTKERNEL = rot_lasx.S
69+
DROTKERNEL = rot_lasx.S
70+
CROTKERNEL = crot_lasx.S
71+
ZROTKERNEL = crot_lasx.S
5472

5573
SNRM2KERNEL = snrm2_lasx.S
5674
DNRM2KERNEL = dnrm2_lasx.S
75+
CNRM2KERNEL = cnrm2_lasx.S
76+
ZNRM2KERNEL = znrm2_lasx.S
77+
78+
CSWAPKERNEL = cswap_lasx.S
79+
ZSWAPKERNEL = cswap_lasx.S
80+
81+
CSUMKERNEL = csum_lasx.S
82+
ZSUMKERNEL = csum_lasx.S
5783

5884
DGEMMKERNEL = dgemm_kernel_16x4.S
5985
DGEMMINCOPY = dgemm_ncopy_16.S
@@ -81,13 +107,35 @@ SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
81107
SGEMVNKERNEL = sgemv_n_8_lasx.S
82108
SGEMVTKERNEL = sgemv_t_8_lasx.S
83109

110+
CGEMMKERNEL = cgemm_kernel_2x2_lsx.S
111+
CGEMMONCOPY = cgemm_ncopy_2_lsx.S
112+
CGEMMOTCOPY = cgemm_tcopy_2_lsx.S
113+
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
114+
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
115+
116+
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
117+
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
118+
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
119+
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
120+
121+
ZGEMMKERNEL = zgemm_kernel_2x2_lasx.S
122+
ZGEMMONCOPY = zgemm_ncopy_2_lasx.S
123+
ZGEMMOTCOPY = zgemm_tcopy_2_lasx.S
124+
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
125+
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
126+
127+
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
128+
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
129+
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
130+
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
131+
84132
DTRSMKERNEL_LN = dtrsm_kernel_LN_16x4_lasx.S
85133
DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_lasx.S
86134
DTRSMKERNEL_RN = dtrsm_kernel_RN_16x4_lasx.S
87135
DTRSMKERNEL_RT = dtrsm_kernel_RT_16x4_lasx.S
88-
endif
89136

90137
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
91138
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
92139
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
93140
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
141+
endif

0 commit comments

Comments
 (0)