Skip to content

Commit d32f38f

Browse files
yanchengyinshiyou
authored and committed
loongarch64: Add optimizations for nrm2.
1 parent f9b4689 commit d32f38f

File tree

6 files changed

+780
-0
lines changed

6 files changed

+780
-0
lines changed

kernel/loongarch64/KERNEL.LOONGSON2K1000

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,4 +52,7 @@ DASUMKERNEL = dasum_lsx.S
5252
SROTKERNEL = srot_lsx.S
5353
DROTKERNEL = drot_lsx.S
5454

55+
SNRM2KERNEL = snrm2_lsx.S
56+
DNRM2KERNEL = dnrm2_lsx.S
57+
5558
endif

kernel/loongarch64/KERNEL.LOONGSON3R5

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,9 @@ DASUMKERNEL = dasum_lasx.S
5252
SROTKERNEL = srot_lasx.S
5353
DROTKERNEL = drot_lasx.S
5454

55+
SNRM2KERNEL = snrm2_lasx.S
56+
DNRM2KERNEL = dnrm2_lasx.S
57+
5558
DGEMMKERNEL = dgemm_kernel_16x4.S
5659
DGEMMINCOPY = dgemm_ncopy_16.S
5760
DGEMMITCOPY = dgemm_tcopy_16.S

kernel/loongarch64/dnrm2_lasx.S

Lines changed: 233 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,233 @@
1+
#define ASSEMBLER

#include "common.h"

/*
 * Double-precision Euclidean norm kernel for LoongArch64 (LASX).
 *
 * Two passes over the input:
 *   pass 1 (.L10/.L21/.L96/.L98): max = largest |x[i]| (via fmaxa)
 *   pass 2 (.L110/.L121/.L996/.L998): sum = sum((x[i] * (1/max))^2)
 * Result returned in $f0: max * sqrt(sum).
 *
 * Early exits through .L999 return the zero-initialised low lane of res1:
 *   N <= 0, INCX == 0, max == 0, or 1/max == +Inf.
 *
 * NOTE(review): when max is +Inf, ALPHA becomes 0 and the scaled values
 * are Inf * 0; this path is unchanged from the original - confirm the
 * desired result for infinite inputs.
 *
 * Register aliasing to keep in mind:
 *   max  ($f17) is the low lane of VM0  ($xr17)
 *   $f15        is the low lane of VX0  ($xr15)
 *   $f19        is the low lane of res1 ($xr19)
 * Because of the first alias, pass 2 must never write VM0: max has to
 * stay intact until the final multiply.
 */

/* Integer registers */
#define N      $r4      // element count
#define X      $r5      // read cursor for pass 1
#define INCX   $r6      // stride; shifted to a byte stride below
#define XX     $r19     // read cursor for pass 2 (copy of the original X)
#define I      $r17     // loop counter
#define TEMP   $r18     // scratch
#define t1     $r12
#define t2     $r13
#define t3     $r14
#define t4     $r15

/* Vector registers */
#define VX0    $xr15
#define VX1    $xr16
#define VM0    $xr17
#define VM1    $xr18
#define VM2    $xr13
#define VM3    $xr14
#define res1   $xr19
#define res2   $xr20
#define VALPHA $xr21    // 1/max replicated to every lane

/* Scalar floating-point registers */
#define INF    $f23
#define a1     $f22
#define max    $f17
#define ALPHA  $f12     // 1/max

    PROLOGUE

#ifdef F_INTERFACE
    LDINT   N, 0(N)
    LDINT   INCX, 0(INCX)
#endif

    xvxor.v res1, res1, res1
    xvxor.v res2, res2, res2
    bge     $r0, N, .L999
    beq     $r0, INCX, .L999
    move    XX, X
    // Init INF (bit pattern 0x7FF << 52)
    addi.d  TEMP, $r0, 0x7FF
    slli.d  TEMP, TEMP, 52
    MTC     INF, TEMP
    li.d    TEMP, SIZE
    slli.d  INCX, INCX, BASE_SHIFT
    srai.d  I, N, 3                     // I = number of 8-element groups
    // Seed every lane of the running maximum with x[0]. A single-element
    // replicating load never reads past the first element (the original
    // 32-byte xvld did when N < 4) and leaves no lane uninitialised.
    xvldrepl.d VM0, X, 0
    bne     INCX, TEMP, .L20
    bge     $r0, I, .L97
    .align 3

.L10:   // pass 1, unit stride: 8 elements per iteration
    xvld    VX0, X, 0 * SIZE
    xvld    VX1, X, 4 * SIZE
    xvfmaxa.d VM1, VX1, VX0
    xvfmaxa.d VM0, VM0, VM1
    addi.d  I, I, -1
    addi.d  X, X, 8 * SIZE
    blt     $r0, I, .L10
    b       .L96
    .align 3

.L20:   // INCX != 1
    bge     $r0, I, .L97
    .align 3

.L21:   // pass 1, strided: gather 8 elements per iteration
    ld.d    t1, X, 0 * SIZE
    add.d   X, X, INCX
    xvinsgr2vr.d VX0, t1, 0
    ld.d    t2, X, 0 * SIZE
    add.d   X, X, INCX
    xvinsgr2vr.d VX0, t2, 1
    ld.d    t3, X, 0 * SIZE
    add.d   X, X, INCX
    xvinsgr2vr.d VX0, t3, 2
    ld.d    t4, X, 0 * SIZE
    add.d   X, X, INCX
    xvinsgr2vr.d VX0, t4, 3
    ld.d    t1, X, 0 * SIZE
    add.d   X, X, INCX
    xvinsgr2vr.d VX1, t1, 0
    ld.d    t2, X, 0 * SIZE
    add.d   X, X, INCX
    xvinsgr2vr.d VX1, t2, 1
    ld.d    t3, X, 0 * SIZE
    add.d   X, X, INCX
    xvinsgr2vr.d VX1, t3, 2
    ld.d    t4, X, 0 * SIZE
    add.d   X, X, INCX
    xvinsgr2vr.d VX1, t4, 3
    xvfmaxa.d VM1, VX0, VX1
    xvfmaxa.d VM0, VM0, VM1
    addi.d  I, I, -1
    blt     $r0, I, .L21
    b       .L96
    .align 3

.L96:   // fold the four lanes of VM0 into lane 0 (= max)
    xvpickve.d VX0, VM0, 1
    xvpickve.d VX1, VM0, 2
    xvpickve.d VM3, VM0, 3
    xvfmaxa.d VM1, VX0, VX1
    xvfmaxa.d VM2, VM3, VM0
    xvfmaxa.d VM0, VM1, VM2
    .align 3

.L97:
    andi    I, N, 7                     // leftover elements
    bge     $r0, I, .L99
    .align 3

.L98:   // pass 1 tail: one element at a time, scalar load so that
        // nothing beyond the current element is touched
    fld.d   a1, X, 0
    addi.d  I, I, -1
    fmaxa.d max, max, a1
    add.d   X, X, INCX
    blt     $r0, I, .L98
    .align 3

.L99:
    fabs.d  max, max
    lu12i.w TEMP, 0x3f800               // 1.0f bit pattern (0x3f800000)
    movgr2fr.d a1, $r0                  // a1 = 0.0
    movgr2fr.w ALPHA, TEMP
    CMPEQ   $fcc0, max, a1
    fcvt.d.s ALPHA, ALPHA               // ALPHA = 1.0
    bcnez   $fcc0, .L999                // max == 0
    fdiv.d  ALPHA, ALPHA, max           // ALPHA = 1 / max
    CMPEQ   $fcc0, INF, ALPHA
    bcnez   $fcc0, .L999                // 1 / max overflowed to +Inf
    movfr2gr.d TEMP, ALPHA
    xvreplgr2vr.d VALPHA, TEMP

.L100:
    li.d    TEMP, SIZE
    bne     INCX, TEMP, .L120
    srai.d  I, N, 3
    bge     $r0, I, .L997
    .align 3

.L110:  // pass 2, unit stride. Scaled values go to VM2/VM3, not VM0,
        // because VM0 aliases max.
    xvld    VX0, XX, 0 * SIZE
    xvld    VX1, XX, 4 * SIZE
    xvfmul.d VM2, VX0, VALPHA
    xvfmul.d VM3, VX1, VALPHA
    xvfmadd.d res1, VM2, VM2, res1
    xvfmadd.d res2, VM3, VM3, res2
    addi.d  XX, XX, 8 * SIZE
    addi.d  I, I, -1
    blt     $r0, I, .L110
    b       .L996
    .align 3

.L120:
    srai.d  I, N, 3
    bge     $r0, I, .L997

.L121:  // pass 2, strided: first four elements -> VX0, next four -> VX1
    ld.d    t1, XX, 0 * SIZE
    add.d   XX, XX, INCX
    ld.d    t2, XX, 0 * SIZE
    add.d   XX, XX, INCX
    ld.d    t3, XX, 0 * SIZE
    add.d   XX, XX, INCX
    ld.d    t4, XX, 0 * SIZE
    add.d   XX, XX, INCX
    xvinsgr2vr.d VX0, t1, 0
    xvinsgr2vr.d VX0, t2, 1
    xvinsgr2vr.d VX0, t3, 2
    xvinsgr2vr.d VX0, t4, 3
    ld.d    t1, XX, 0 * SIZE
    add.d   XX, XX, INCX
    ld.d    t2, XX, 0 * SIZE
    add.d   XX, XX, INCX
    ld.d    t3, XX, 0 * SIZE
    add.d   XX, XX, INCX
    ld.d    t4, XX, 0 * SIZE
    add.d   XX, XX, INCX
    xvinsgr2vr.d VX1, t1, 0
    xvinsgr2vr.d VX1, t2, 1
    xvinsgr2vr.d VX1, t3, 2
    xvinsgr2vr.d VX1, t4, 3
    xvfmul.d VM2, VX0, VALPHA
    xvfmul.d VM3, VX1, VALPHA
    xvfmadd.d res1, VM2, VM2, res1
    xvfmadd.d res2, VM3, VM3, res2
    addi.d  I, I, -1
    blt     $r0, I, .L121
    b       .L996
    .align 3

.L996:  // fold both accumulators into lane 0 of res1 (= $f19)
    xvfadd.d res1, res1, res2
    xvpickve.d VX0, res1, 1
    xvpickve.d VX1, res1, 2
    xvpickve.d VM2, res1, 3
    xvfadd.d res1, VX0, res1
    xvfadd.d VX1, VX1, VM2
    xvfadd.d res1, VX1, res1
    .align 3

.L997:
    andi    I, N, 7
    bge     $r0, I, .L995               // no tail: still finish with sqrt
    .align 3

.L998:  // pass 2 tail: one element at a time
    fld.d   $f15, XX, 0 * SIZE
    addi.d  I, I, -1
    fmul.d  $f15, $f15, ALPHA
    fmadd.d $f19, $f15, $f15, $f19
    add.d   XX, XX, INCX
    blt     $r0, I, .L998
    .align 3

.L995:  // result = max * sqrt(sum of scaled squares)
    fsqrt.d $f19, $f19
    fmul.d  $f0, max, $f19
    jirl    $r0, $r1, 0x0
    .align 3

.L999:  // early exit: return the zero-initialised accumulator
    fmov.d  $f0, $f19
    jirl    $r0, $r1, 0x0

    EPILOGUE

0 commit comments

Comments
 (0)