
Commit 993ede7

yanchengyinshiyou authored and committed
loongarch64: Add optimizations for scal.
1 parent 39bf8ec commit 993ede7

6 files changed: +787 -0 lines changed


kernel/loongarch64/KERNEL.LOONGSON2K1000

Lines changed: 3 additions & 0 deletions
@@ -4,4 +4,7 @@ SDOTKERNEL = dot_lsx.S
 DSDOTKERNEL = dot_lsx.S
 DDOTKERNEL = dot_lsx.S
 
+SSCALKERNEL = sscal_lsx.S
+DSCALKERNEL = dscal_lsx.S
+
 endif

kernel/loongarch64/KERNEL.LOONGSON3R5

Lines changed: 3 additions & 0 deletions
@@ -4,6 +4,9 @@ SDOTKERNEL = dot_lasx.S
 DSDOTKERNEL = dot_lasx.S
 DDOTKERNEL = dot_lasx.S
 
+SSCALKERNEL = sscal_lasx.S
+DSCALKERNEL = dscal_lasx.S
+
 DGEMMKERNEL = dgemm_kernel_16x4.S
 DGEMMINCOPY = dgemm_ncopy_16.S
 DGEMMITCOPY = dgemm_tcopy_16.S

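For context (not part of the commit): the KERNEL.LOONGSON2K1000 and KERNEL.LOONGSON3R5 files tell the OpenBLAS build which source file implements each BLAS kernel for that target, so these additions route single- and double-precision SCAL (x := alpha*x) to the new LSX and LASX assembly kernels. A minimal, hypothetical smoke test through the CBLAS interface might look like the sketch below; the file name and the link line are assumptions about a typical OpenBLAS install, not part of this change.

#include <stdio.h>
#include <cblas.h>

int main(void) {
    double x[8] = {1, 2, 3, 4, 5, 6, 7, 8};

    /* DSCAL: x := 2.5 * x over 8 elements with unit stride; on an
     * LASX-capable LoongArch core this should dispatch to the
     * dscal_lasx.S kernel added by this commit. */
    cblas_dscal(8, 2.5, x, 1);

    for (int i = 0; i < 8; i++)
        printf("%g ", x[i]);   /* expect 2.5 5 7.5 10 12.5 15 17.5 20 */
    printf("\n");
    return 0;
}

Built against the patched library with something like cc test_dscal.c -lopenblas (compiler flags and paths are assumptions), this exercises the newly selected kernel end to end.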
kernel/loongarch64/dscal_lasx.S

Lines changed: 194 additions & 0 deletions
@@ -0,0 +1,194 @@
#define ASSEMBLER
#include "common.h"

#define N      $r4
#define ALPHA  $f0
#define X      $r7
#define INCX   $r8
#define I      $r12
#define TEMP   $r13
#define t1     $r14
#define t2     $r18
#define t3     $r15
#define t4     $r17
#define XX     $r16
#define VX0    $xr12
#define VX1    $xr13
#define VT0    $xr14
#define VT1    $xr15
#define VALPHA $xr19
#define a1     $f8
#define a2     $f23

    PROLOGUE

    bge     $r0, N, .L999
    bge     $r0, INCX, .L999
    li.d    TEMP, 1
    movgr2fr.d  a1, $r0
    ffint.d.l   a1, a1
    movgr2fr.d  a2, TEMP
    ffint.d.l   a2, a2
    slli.d  TEMP, TEMP, BASE_SHIFT
    slli.d  INCX, INCX, BASE_SHIFT
    fcmp.ceq.d  $fcc0, ALPHA, a1
    bcnez   $fcc0, .L20    //ALPHA==0
    fcmp.ceq.d  $fcc0, ALPHA, a2
    bcnez   $fcc0, .L999   //ALPHA==1 return
    srai.d  I, N, 3
    beq     INCX, TEMP, .L30   //ALPHA!=0|1 and INCX==1
    movfr2gr.d  TEMP, ALPHA
    xvreplgr2vr.d  VALPHA, TEMP
    move    XX, X
    .align 3

.L10:
    bge     $r0, I, .L32
    .align 3
.L11:
    ld.d    t1, X, 0 * SIZE
    add.d   X, X, INCX
    ld.d    t2, X, 0 * SIZE
    add.d   X, X, INCX
    ld.d    t3, X, 0 * SIZE
    add.d   X, X, INCX
    ld.d    t4, X, 0 * SIZE
    add.d   X, X, INCX
    xvinsgr2vr.d  VX0, t1, 0
    xvinsgr2vr.d  VX0, t2, 1
    xvinsgr2vr.d  VX0, t3, 2
    xvinsgr2vr.d  VX0, t4, 3
    ld.d    t1, X, 0 * SIZE
    add.d   X, X, INCX
    ld.d    t2, X, 0 * SIZE
    add.d   X, X, INCX
    xvfmul.d  VT0, VX0, VALPHA
    ld.d    t3, X, 0 * SIZE
    add.d   X, X, INCX
    ld.d    t4, X, 0 * SIZE
    add.d   X, X, INCX
    xvinsgr2vr.d  VX1, t1, 0
    xvinsgr2vr.d  VX1, t2, 1
    xvinsgr2vr.d  VX1, t3, 2
    xvinsgr2vr.d  VX1, t4, 3
    xvstelm.d  VT0, XX, 0, 0
    add.d   XX, XX, INCX
    xvstelm.d  VT0, XX, 0, 1
    add.d   XX, XX, INCX
    xvstelm.d  VT0, XX, 0, 2
    add.d   XX, XX, INCX
    xvstelm.d  VT0, XX, 0, 3
    add.d   XX, XX, INCX
    xvfmul.d  VT1, VX1, VALPHA
    xvstelm.d  VT1, XX, 0, 0
    add.d   XX, XX, INCX
    xvstelm.d  VT1, XX, 0, 1
    add.d   XX, XX, INCX
    xvstelm.d  VT1, XX, 0, 2
    add.d   XX, XX, INCX
    xvstelm.d  VT1, XX, 0, 3
    add.d   XX, XX, INCX
    addi.d  I, I, -1
    blt     $r0, I, .L11
    b       .L32
    .align 3

.L20:
    srai.d  I, N, 3
    beq     INCX, TEMP, .L24
    bge     $r0, I, .L22
    .align 3

.L21:
    fst.d   a1, X, 0
    add.d   X, X, INCX
    fst.d   a1, X, 0
    add.d   X, X, INCX
    fst.d   a1, X, 0
    add.d   X, X, INCX
    fst.d   a1, X, 0
    add.d   X, X, INCX
    fst.d   a1, X, 0
    add.d   X, X, INCX
    fst.d   a1, X, 0
    add.d   X, X, INCX
    fst.d   a1, X, 0
    add.d   X, X, INCX
    fst.d   a1, X, 0
    add.d   X, X, INCX
    addi.d  I, I, -1
    blt     $r0, I, .L21
    .align 3

.L22:
    andi    I, N, 7
    bge     $r0, I, .L999
    .align 3
.L23:
    fst.d   a1, X, 0 * SIZE
    addi.d  I, I, -1
    add.d   X, X, INCX
    blt     $r0, I, .L23
    jirl    $r0, $r1, 0
    .align 3

.L24:
    bge     $r0, I, .L26    /*N<8 INCX==1*/
    .align 3
.L25:
    xvxor.v VX0, VX0, VX0
    xvst    VX0, X, 0 * SIZE
    xvst    VX0, X, 4 * SIZE
    addi.d  I, I, -1
    addi.d  X, X, 8 * SIZE
    blt     $r0, I, .L25
    .align 3

.L26:
    andi    I, N, 7
    bge     $r0, I, .L999
    .align 3
.L27:
    fst.d   a1, X, 0 * SIZE
    addi.d  I, I, -1
    addi.d  X, X, SIZE
    blt     $r0, I, .L27
    jirl    $r0, $r1, 0
    .align 3

.L30:
    bge     $r0, I, .L32    /*N<8 INCX==1*/
    movfr2gr.d  TEMP, ALPHA
    xvreplgr2vr.d  VALPHA, TEMP
    .align 3

.L31:
    xvld    VX0, X, 0 * SIZE
    xvld    VX1, X, 4 * SIZE
    xvfmul.d  VT0, VX0, VALPHA
    xvfmul.d  VT1, VX1, VALPHA
    addi.d  I, I, -1
    xvst    VT0, X, 0 * SIZE
    xvst    VT1, X, 4 * SIZE
    addi.d  X, X, 8 * SIZE
    blt     $r0, I, .L31
    .align 3

.L32:
    andi    I, N, 7
    bge     $r0, I, .L999
    .align 3
.L33:
    fld.d   a1, X, 0 * SIZE
    addi.d  I, I, -1
    fmul.d  a1, ALPHA, a1
    fst.d   a1, X, 0 * SIZE
    add.d   X, X, INCX
    blt     $r0, I, .L33
    jirl    $r0, $r1, 0
    .align 3

.L999:
    jirl    $r0, $r1, 0x0

    EPILOGUE
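A note on what dscal_lasx.S above computes (again, not part of the commit): it implements double-precision SCAL, x := alpha*x over N elements with stride INCX, handling 8 doubles per main-loop iteration in two 256-bit LASX registers. It special-cases alpha == 0 (branch to .L20, which stores zeros) and alpha == 1 (early return via .L999), and keeps separate paths for unit stride (contiguous xvld/xvst at .L31) and non-unit stride (gathering elements with xvinsgr2vr.d and scattering results with xvstelm.d at .L11), with a scalar tail loop at .L33 for the remaining N mod 8 elements. A plain C sketch of the same logic, with an illustrative name and signature rather than OpenBLAS's internal kernel interface:

/* Scalar sketch of the operation dscal_lasx.S implements: x := alpha * x. */
static void dscal_ref(long n, double alpha, double *x, long incx)
{
    if (n <= 0 || incx <= 0)      /* kernel returns immediately (.L999) */
        return;
    if (alpha == 1.0)             /* nothing to do */
        return;
    if (alpha == 0.0) {           /* .L20 path: store zeros */
        for (long i = 0; i < n; i++)
            x[i * incx] = 0.0;
        return;
    }
    for (long i = 0; i < n; i++)  /* .L10/.L30 paths: scale in place */
        x[i * incx] *= alpha;
}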
