Skip to content

Commit 9fe07d8

Browse files
committed
loongarch: Add LSX optimization for dot.
1 parent 13b8c44 commit 9fe07d8

File tree

2 files changed

+371
-0
lines changed

2 files changed

+371
-0
lines changed
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
# Select the LSX-vectorized dot kernel unless LSX support is disabled.
ifndef NO_LSX

SDOTKERNEL  = dot_lsx.S
DSDOTKERNEL = dot_lsx.S
DDOTKERNEL  = dot_lsx.S

endif

kernel/loongarch64/dot_lsx.S

Lines changed: 364 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,364 @@
/***************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#define ASSEMBLER

#include "common.h"

/*
 * LSX-vectorized DOT kernel for LoongArch64.
 *
 * Computes sum over i of x[i] * y[i].  Built as SDOT (float), DDOT
 * (DOUBLE defined), and DSDOT (DSDOT defined: float inputs widened to
 * double before each multiply-accumulate, see fcvt.d.s / vfcvt{l,h}.d.s).
 *
 * In:   N    ($r4)  element count
 *       X    ($r5)  x vector
 *       INCX ($r6)  x stride (in elements on entry; scaled to bytes below)
 *       Y    ($r7)  y vector
 *       INCY ($r8)  y stride (in elements on entry; scaled to bytes below)
 * Out:  result in $f0
 */

#define N     $r4
#define X     $r5
#define INCX  $r6
#define Y     $r7
#define INCY  $r8

#define I     $r17
#define TEMP  $r18

/* Don't change following FR unless you know the effects. */
#define s1    $f8
#define s2    $f9
#define a1    $f10
#define b1    $f11

	PROLOGUE

#ifdef F_INTERFACE
	/* Fortran interface passes everything by reference. */
	LDINT   N,    0(N)
	LDINT   INCX, 0(INCX)
	LDINT   INCY, 0(INCY)
#endif

	/* init $f8 and $f9 (the two scalar accumulators) to zero */
	SUB     s1, s1, s1
	SUB     s2, s2, s2
	slli.d  INCX, INCX, BASE_SHIFT  /* strides: elements -> bytes */
	li.d    TEMP, SIZE
	slli.d  INCY, INCY, BASE_SHIFT
	bge     $r0, N, .L999           /* N <= 0: return 0.0 */
	bne     INCX, TEMP, .L20        /* goto scalar path if inc_x != 1 */
	bne     INCY, TEMP, .L20        /* goto scalar path if inc_y != 1 */

	/* (inc_x == 1) && (inc_y == 1): contiguous, vectorized path */

	/* init $vr8 and $vr9 (the two vector accumulators) to zero */
#ifdef DOUBLE
	vldrepl.d  $vr0, X, 0
#else
	vldrepl.w  $vr0, X, 0
#endif
#ifdef DSDOT
	vfcvtl.d.s $vr0, $vr0
	vfsub.d    $vr8, $vr0, $vr0
	vfsub.d    $vr9, $vr0, $vr0
#else
	VFSUB      $vr8, $vr0, $vr0
	VFSUB      $vr9, $vr0, $vr0
#endif

#ifdef DOUBLE
	srai.d  I, N, 3                 /* 8 doubles per unrolled iteration */
#else
	srai.d  I, N, 4                 /* 16 floats per unrolled iteration */
#endif
	bge     $r0, I, .L12            /* FLOAT: <16 ; DOUBLE: <8 */
	.align 3
.L11:
	/* main loop, 64 bytes of each operand per iteration
	   (FLOAT: 16 elements ; DOUBLE: 8 elements) */
	vld     $vr0, X, 0
	vld     $vr1, X, 16
	vld     $vr2, X, 32
	vld     $vr3, X, 48
	vld     $vr4, Y, 0
	vld     $vr5, Y, 16
	vld     $vr6, Y, 32
	vld     $vr7, Y, 48
	addi.w  I, I, -1
	addi.d  X, X, 64
	addi.d  Y, Y, 64
#ifdef DSDOT
	/* Widen f32 -> f64, then multiply MATCHING x/y halves:
	   x_lo*y_lo into $vr8, x_hi*y_hi into $vr9.
	   (Original paired $vr10*$vr12 = x_lo*x_hi and $vr11*$vr13 =
	   y_lo*y_hi, which is not a dot product — fixed here.) */
	vfcvtl.d.s $vr10, $vr0
	vfcvtl.d.s $vr11, $vr4
	vfcvth.d.s $vr12, $vr0
	vfcvth.d.s $vr13, $vr4
	vfmadd.d   $vr8, $vr10, $vr11, $vr8
	vfmadd.d   $vr9, $vr12, $vr13, $vr9
	vfcvtl.d.s $vr10, $vr1
	vfcvtl.d.s $vr11, $vr5
	vfcvth.d.s $vr12, $vr1
	vfcvth.d.s $vr13, $vr5
	vfmadd.d   $vr8, $vr10, $vr11, $vr8
	vfmadd.d   $vr9, $vr12, $vr13, $vr9
	vfcvtl.d.s $vr10, $vr2
	vfcvtl.d.s $vr11, $vr6
	vfcvth.d.s $vr12, $vr2
	vfcvth.d.s $vr13, $vr6
	vfmadd.d   $vr8, $vr10, $vr11, $vr8
	vfmadd.d   $vr9, $vr12, $vr13, $vr9
	vfcvtl.d.s $vr10, $vr3
	vfcvtl.d.s $vr11, $vr7
	vfcvth.d.s $vr12, $vr3
	vfcvth.d.s $vr13, $vr7
	vfmadd.d   $vr8, $vr10, $vr11, $vr8
	vfmadd.d   $vr9, $vr12, $vr13, $vr9
#else
	VFMADD  $vr8, $vr0, $vr4, $vr8
	VFMADD  $vr9, $vr1, $vr5, $vr9
	VFMADD  $vr8, $vr2, $vr6, $vr8
	VFMADD  $vr9, $vr3, $vr7, $vr9
#endif
	bnez    I, .L11
	.align 3
.L12:
	/* remaining whole 16-byte vectors */
#ifdef DOUBLE
	andi    I, N, 0x7
	srai.d  I, I, 1                 /* (N % 8) / 2 two-double vectors */
#else
	andi    I, N, 0xf
	srai.d  I, I, 2                 /* (N % 16) / 4 four-float vectors */
#endif
	bge     $r0, I, .L14            /* DOUBLE: <2 ; FLOAT: <4 */
	.align 3
.L13:
	/* FLOAT: 4~15 ; DOUBLE: 2~7 remaining elements */
	vld     $vr0, X, 0
	vld     $vr4, Y, 0
	addi.w  I, I, -1
	addi.d  X, X, 16
	addi.d  Y, Y, 16
#ifdef DSDOT
	/* same fix as in .L11: multiply matching halves of x and y */
	vfcvtl.d.s $vr10, $vr0
	vfcvtl.d.s $vr11, $vr4
	vfcvth.d.s $vr12, $vr0
	vfcvth.d.s $vr13, $vr4
	vfmadd.d   $vr8, $vr10, $vr11, $vr8
	vfmadd.d   $vr9, $vr12, $vr13, $vr9
#else
	VFMADD  $vr8, $vr0, $vr4, $vr8
#endif
	bnez    I, .L13
	.align 3
.L14:
	/* horizontal reduction; scalar dot lands in s1 ($f8, lane 0 of $vr8) */
#ifdef DSDOT
	vfadd.d $vr8, $vr8, $vr9
	fsub.s  s2, s2, s2              /* set s2 to 0.0 (trailing comma removed) */
	vpackod.d $vr0, $vr8, $vr8
	vfadd.d $vr8, $vr8, $vr0
#else
	VFADD   $vr8, $vr8, $vr9
	SUB     s2, s2, s2              /* set s2 to 0.0 */
	vpackod.d $vr0, $vr8, $vr8
#ifdef DOUBLE
	VFADD   $vr8, $vr8, $vr0
#else
	VFADD   $vr8, $vr8, $vr0
	vpackod.w $vr0, $vr8, $vr8
	VFADD   $vr8, $vr8, $vr0
#endif /* defined DOUBLE */
#endif /* defined DSDOT */
	.align 3
.L15:
	/* scalar tail count */
#ifdef DOUBLE
	andi    I, N, 0x1
#else
	andi    I, N, 0x3
#endif
	bge     $r0, I, .L999           /* no tail */
	.align 3
.L16:
	/* scalar tail: DOUBLE: 1 ; FLOAT: 1~3 elements */
	LD      a1, X, 0
	LD      b1, Y, 0
#ifdef DSDOT
	fcvt.d.s a1, a1
	fcvt.d.s b1, b1
	fmadd.d s1, b1, a1, s1
#else
	MADD    s1, b1, a1, s1
#endif
	addi.d  I, I, -1
	addi.d  X, X, SIZE
	addi.d  Y, Y, SIZE
	bnez    I, .L16
	b       .L999
	.align 3

.L20:
	/* !((inc_x == 1) && (inc_y == 1)): strided scalar path, 8x unrolled */
	srai.d  I, N, 3
#ifdef F_INTERFACE
	/* BLAS negative-stride convention: rewind the pointer to the last
	   element.  NOTE(review): the original used MIPS mult/mflo/dsub,
	   which do not exist on LoongArch64 — replaced with mul.d/sub.d. */
	bgez    INCX, .L21
	addi.d  TEMP, N, -1
	mul.d   TEMP, TEMP, INCX
	sub.d   X, X, TEMP
	.align 3

.L21:
	bgez    INCY, .L22
	addi.d  TEMP, N, -1
	mul.d   TEMP, TEMP, INCY
	sub.d   Y, Y, TEMP
	.align 3

.L22:
#endif
	bge     $r0, I, .L25            /* <8 elements */
	.align 3

.L23:
	/* 8 elements per iteration, alternating accumulators s1/s2 */
	LD      a1, X, 0 * SIZE
	add.d   X, X, INCX
	LD      b1, Y, 0 * SIZE
	add.d   Y, Y, INCY
#ifdef DSDOT
	fcvt.d.s a1, a1
	fcvt.d.s b1, b1
	fmadd.d s1, b1, a1, s1
#else
	MADD    s1, b1, a1, s1
#endif

	LD      a1, X, 0 * SIZE
	add.d   X, X, INCX
	LD      b1, Y, 0 * SIZE
	add.d   Y, Y, INCY
#ifdef DSDOT
	fcvt.d.s a1, a1
	fcvt.d.s b1, b1
	fmadd.d s2, b1, a1, s2
#else
	MADD    s2, b1, a1, s2
#endif

	LD      a1, X, 0 * SIZE
	add.d   X, X, INCX
	LD      b1, Y, 0 * SIZE
	add.d   Y, Y, INCY
#ifdef DSDOT
	fcvt.d.s a1, a1
	fcvt.d.s b1, b1
	fmadd.d s1, b1, a1, s1
#else
	MADD    s1, b1, a1, s1
#endif

	LD      a1, X, 0 * SIZE
	add.d   X, X, INCX
	LD      b1, Y, 0 * SIZE
	add.d   Y, Y, INCY
#ifdef DSDOT
	fcvt.d.s a1, a1
	fcvt.d.s b1, b1
	fmadd.d s2, b1, a1, s2
#else
	MADD    s2, b1, a1, s2
#endif

	LD      a1, X, 0 * SIZE
	add.d   X, X, INCX
	LD      b1, Y, 0 * SIZE
	add.d   Y, Y, INCY
#ifdef DSDOT
	fcvt.d.s a1, a1
	fcvt.d.s b1, b1
	fmadd.d s1, b1, a1, s1
#else
	MADD    s1, b1, a1, s1
#endif

	LD      a1, X, 0 * SIZE
	add.d   X, X, INCX
	LD      b1, Y, 0 * SIZE
	add.d   Y, Y, INCY
#ifdef DSDOT
	fcvt.d.s a1, a1
	fcvt.d.s b1, b1
	fmadd.d s2, b1, a1, s2
#else
	MADD    s2, b1, a1, s2
#endif

	LD      a1, X, 0 * SIZE
	add.d   X, X, INCX
	LD      b1, Y, 0 * SIZE
	add.d   Y, Y, INCY
#ifdef DSDOT
	fcvt.d.s a1, a1
	fcvt.d.s b1, b1
	fmadd.d s1, b1, a1, s1
#else
	MADD    s1, b1, a1, s1
#endif

	LD      a1, X, 0 * SIZE
	add.d   X, X, INCX
	LD      b1, Y, 0 * SIZE
	add.d   Y, Y, INCY
	addi.d  I, I, -1
#ifdef DSDOT
	fcvt.d.s a1, a1
	fcvt.d.s b1, b1
	fmadd.d s2, b1, a1, s2
#else
	MADD    s2, b1, a1, s2
#endif
	blt     $r0, I, .L23
	.align 3

.L25:
	/* strided scalar tail: N % 8 elements */
	andi    I, N, 7
	bge     $r0, I, .L999
	.align 3

.L26:
	LD      a1, X, 0 * SIZE
	add.d   X, X, INCX
	LD      b1, Y, 0 * SIZE
	add.d   Y, Y, INCY
	addi.d  I, I, -1
#ifdef DSDOT
	fcvt.d.s a1, a1
	fcvt.d.s b1, b1
	fmadd.d s1, b1, a1, s1
#else
	MADD    s1, b1, a1, s1
#endif
	blt     $r0, I, .L26
	.align 3

.L999:
	/* result = s1 + s2, returned in $f0 */
#ifdef DSDOT
	fadd.d  $f0, s1, s2
#else
	ADD     $f0, s1, s2
#endif
	move    $r4, $r17
	jirl    $r0, $r1, 0x0

	EPILOGUE

0 commit comments

Comments
 (0)