Skip to content

Commit d51ffec

Browse files
committed
LoongArch64: Opt cgemv with LASX
1 parent 99ef76f commit d51ffec

File tree

4 files changed

+968
-0
lines changed

4 files changed

+968
-0
lines changed

kernel/loongarch64/KERNEL.LOONGSON3R5

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,9 @@ CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
121121
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
122122
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
123123

124+
# Select the LASX-optimized complex GEMV kernels (N- and T-transposed forms)
# for this target; sources live in kernel/loongarch64/.
CGEMVNKERNEL = cgemv_n_8_lasx.S
125+
CGEMVTKERNEL = cgemv_t_8_lasx.S
126+
124127
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
125128
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
126129
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c

kernel/loongarch64/cgemv_n_8_lasx.S

Lines changed: 383 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,383 @@
1+
/*******************************************************************************
2+
Copyright (c) 2024, The OpenBLAS Project
3+
All rights reserved.
4+
Redistribution and use in source and binary forms, with or without
5+
modification, are permitted provided that the following conditions are
6+
met:
7+
1. Redistributions of source code must retain the above copyright
8+
notice, this list of conditions and the following disclaimer.
9+
2. Redistributions in binary form must reproduce the above copyright
10+
notice, this list of conditions and the following disclaimer in
11+
the documentation and/or other materials provided with the
12+
distribution.
13+
3. Neither the name of the OpenBLAS project nor the names of
14+
its contributors may be used to endorse or promote products
15+
derived from this software without specific prior written permission.
16+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19+
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25+
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+
*******************************************************************************/
27+
#define ASSEMBLER
28+
29+
#include "common.h"
30+
#include "loongarch64_asm.S"
31+
32+
/*********************************************************************
33+
* 2024/02/20 guxiwei
34+
* UTEST : OK
35+
* CTEST : OK
36+
* TEST : OK
37+
*
38+
*
39+
*********************************************************************/
40+
41+
/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
42+
* FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
43+
*/
44+
#define M $r4
45+
#define N $r5
46+
#define ALPHA_R $f0
47+
#define ALPHA_I $f1
48+
#define A $r7
49+
#define LDA $r8
50+
#define X $r9
51+
#define INC_X $r10
52+
#define Y $r11
53+
#define INC_Y $r6
54+
55+
#define J $r12
56+
#define I $r13
57+
#define K $r14
58+
#define Y_ORG $r15
59+
#define OFFSET $r16
60+
#define K_LDA $r17
61+
#define M8 $r18
62+
#define T0 $r19
63+
#define PA0 $r20
64+
#define PA1 $r23
65+
#define PA2 $r24
66+
#define PA3 $r25
67+
#define PA4 $r26
68+
#define PA5 $r27
69+
#define PA6 $r28
70+
#define PA7 $r29
71+
72+
#define VALPHA $xr1
73+
#define X0 $xr2
74+
#define X1 $xr3
75+
#define X2 $xr4
76+
#define X3 $xr5
77+
#define X4 $xr6
78+
#define X5 $xr7
79+
#define X6 $xr8
80+
#define X7 $xr9
81+
#define Y0 $xr10
82+
#define Y1 $xr11
83+
#define A0 $xr12
84+
#define A1 $xr13
85+
#define A2 $xr14
86+
#define A3 $xr15
87+
#define A4 $xr16
88+
#define A5 $xr17
89+
#define A6 $xr18
90+
#define A7 $xr19
91+
#define A8 $xr20
92+
#define A9 $xr21
93+
#define A10 $xr22
94+
#define A11 $xr23
95+
#define A12 $xr24
96+
#define A13 $xr25
97+
#define A14 $xr26
98+
#define A15 $xr27
99+
#define TMP0 $xr28
100+
#define TMP1 $xr29
101+
#define TMP2 $xr30
102+
103+
/* Map the BLAS conjugation build flags onto two 0/1 macro arguments consumed
 * by GCOMPLEXMUL/GCOMPLEXMADD (defined in loongarch64_asm.S, not shown here):
 *   GXCONJ — presumably conjugate the x/alpha operand (set by XCONJ)
 *   GCONJ  — presumably conjugate the matrix operand (set by CONJ)
 * All four CONJ/XCONJ combinations are enumerated explicitly. */
#if !defined(CONJ)
104+
#if !defined(XCONJ)
105+
#define GXCONJ 0
106+
#define GCONJ 0
107+
#else
108+
#define GXCONJ 1
109+
#define GCONJ 0
110+
#endif
111+
#else
112+
#if !defined(XCONJ)
113+
#define GXCONJ 0
114+
#define GCONJ 1
115+
#else
116+
#define GXCONJ 1
117+
#define GCONJ 1
118+
#endif
119+
#endif
120+
121+
// CLOAD_X_8: contiguous-x case (inc_x == 1).
// Broadcast eight consecutive 64-bit (re,im) single-complex pairs from X
// into X0..X7 (one xvldrepl.d each, offsets 0x00..0x38), then scale each
// by alpha via GCOMPLEXMUL (helper from loongarch64_asm.S; GXCONJ selects
// conjugation of x).  Clobbers TMP0-TMP2.
.macro CLOAD_X_8
122+
GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18, \
123+
X4, X, 0x20, X5, X, 0x28, X6, X, 0x30, X7, X, 0x38
124+
GCOMPLEXMUL GXCONJ, \
125+
xvf, s, X0, X0, VALPHA, TMP0, TMP1, TMP2, \
126+
X1, X1, VALPHA, TMP0, TMP1, TMP2, \
127+
X2, X2, VALPHA, TMP0, TMP1, TMP2, \
128+
X3, X3, VALPHA, TMP0, TMP1, TMP2, \
129+
X4, X4, VALPHA, TMP0, TMP1, TMP2, \
130+
X5, X5, VALPHA, TMP0, TMP1, TMP2, \
131+
X6, X6, VALPHA, TMP0, TMP1, TMP2, \
132+
X7, X7, VALPHA, TMP0, TMP1, TMP2
133+
.endm
134+
135+
// CLOAD_X_8_GAP: strided-x case (inc_x != 1; INC_X already scaled to bytes).
// Walk T0 through eight x elements INC_X bytes apart, broadcasting each
// (re,im) pair into X0..X7, then scale all eight by alpha exactly as in
// CLOAD_X_8.  Clobbers T0 and TMP0-TMP2.
.macro CLOAD_X_8_GAP
136+
xvldrepl.d X0, X, 0x00
137+
PTR_ADD T0, X, INC_X
138+
xvldrepl.d X1, T0, 0x00
139+
PTR_ADD T0, T0, INC_X
140+
xvldrepl.d X2, T0, 0x00
141+
PTR_ADD T0, T0, INC_X
142+
xvldrepl.d X3, T0, 0x00
143+
PTR_ADD T0, T0, INC_X
144+
xvldrepl.d X4, T0, 0x00
145+
PTR_ADD T0, T0, INC_X
146+
xvldrepl.d X5, T0, 0x00
147+
PTR_ADD T0, T0, INC_X
148+
xvldrepl.d X6, T0, 0x00
149+
PTR_ADD T0, T0, INC_X
150+
xvldrepl.d X7, T0, 0x00
151+
152+
GCOMPLEXMUL GXCONJ, \
153+
xvf, s, X0, X0, VALPHA, TMP0, TMP1, TMP2, \
154+
X1, X1, VALPHA, TMP0, TMP1, TMP2, \
155+
X2, X2, VALPHA, TMP0, TMP1, TMP2, \
156+
X3, X3, VALPHA, TMP0, TMP1, TMP2, \
157+
X4, X4, VALPHA, TMP0, TMP1, TMP2, \
158+
X5, X5, VALPHA, TMP0, TMP1, TMP2, \
159+
X6, X6, VALPHA, TMP0, TMP1, TMP2, \
160+
X7, X7, VALPHA, TMP0, TMP1, TMP2
161+
.endm
162+
163+
// CLOAD_Y_8: contiguous-y case (inc_y == 1).
// Load eight consecutive complex singles (64 bytes) from Y into Y0/Y1
// as two 256-bit LASX vectors.
.macro CLOAD_Y_8
164+
GLD xv, , Y0, Y, 0, Y1, Y, 0x20
165+
.endm
166+
167+
// CLOAD_Y_8_GAP: strided-y case (inc_y != 1; INC_Y already scaled to bytes).
// Gather eight (re,im) pairs with scalar fld.d/fldx.d into the low doubles
// of $f10/$f13/$f14/$f15 (elements 0-3) and $f11/$f17/$f18/$f19 (elements
// 4-7) — i.e. the FP views of Y0, A1-A3, Y1, A5-A7 — then GINSVE0 packs the
// staged lanes into Y0 (lanes 1-3) and Y1 (lanes 1-3).  A1-A3/A5-A7 are
// used here only as staging registers.  Clobbers T0.
.macro CLOAD_Y_8_GAP
168+
fld.d $f10, Y, 0
169+
fldx.d $f13, Y, INC_Y
170+
PTR_ALSL T0, INC_Y, Y, 1
171+
fld.d $f14, T0, 0
172+
fldx.d $f15, T0, INC_Y
173+
PTR_ALSL T0, INC_Y, Y, 2
174+
fld.d $f11, T0, 0
175+
fldx.d $f17, T0, INC_Y
176+
PTR_ADD T0, T0, INC_Y
177+
PTR_ADD T0, T0, INC_Y
178+
fld.d $f18, T0, 0
179+
fldx.d $f19, T0, INC_Y
180+
GINSVE0 xv, d, Y0, A1, 1, Y0, A2, 2, Y0, A3, 3, Y1, A5, 1, Y1, A6, 2, Y1, A7, 3
181+
.endm
182+
183+
// CSTORE_Y_8_GAP: strided-y store (inc_y != 1).
// Scatter the eight 64-bit (re,im) lanes of Y0/Y1 back to memory with
// xvstelm.d, advancing T0 by INC_Y bytes between elements.  Clobbers T0.
.macro CSTORE_Y_8_GAP
184+
xvstelm.d Y0, Y, 0, 0
185+
PTR_ADD T0, Y, INC_Y
186+
xvstelm.d Y0, T0, 0, 1
187+
PTR_ADD T0, T0, INC_Y
188+
xvstelm.d Y0, T0, 0, 2
189+
PTR_ADD T0, T0, INC_Y
190+
xvstelm.d Y0, T0, 0, 3
191+
192+
PTR_ADD T0, T0, INC_Y
193+
xvstelm.d Y1, T0, 0, 0
194+
PTR_ADD T0, T0, INC_Y
195+
xvstelm.d Y1, T0, 0, 1
196+
PTR_ADD T0, T0, INC_Y
197+
xvstelm.d Y1, T0, 0, 2
198+
PTR_ADD T0, T0, INC_Y
199+
xvstelm.d Y1, T0, 0, 3
200+
.endm
201+
202+
// CGEMV_N_8x8: core 8x8 update for the N (non-transposed) kernel.
// Loads an 8-row slice of eight columns of A: two 256-bit vectors per
// column (A0/A1 from PA0, ..., A14/A15 from PA7), post-incrementing each
// column pointer by 0x20 per load via GLD_INC.  Then accumulates
// Y0/Y1 += X_j * A_col_j for j = 0..7 with GCOMPLEXMADD (complex FMA
// helper from loongarch64_asm.S; GXCONJ/GCONJ select conjugation).
// Clobbers TMP0-TMP2; advances PA0..PA7 by 0x40 in total.
.macro CGEMV_N_8x8
203+
GLD_INC xv, , 0x20, \
204+
A0, PA0, 0, A1, PA0, 0, \
205+
A2, PA1, 0, A3, PA1, 0, \
206+
A4, PA2, 0, A5, PA2, 0, \
207+
A6, PA3, 0, A7, PA3, 0, \
208+
A8, PA4, 0, A9, PA4, 0, \
209+
A10, PA5, 0, A11, PA5, 0, \
210+
A12, PA6, 0, A13, PA6, 0, \
211+
A14, PA7, 0, A15, PA7, 0
212+
213+
GCOMPLEXMADD GXCONJ, GCONJ, \
214+
xvf, s, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, Y1, X0, A1, Y1, TMP0, TMP1, TMP2, \
215+
Y0, X1, A2, Y0, TMP0, TMP1, TMP2, Y1, X1, A3, Y1, TMP0, TMP1, TMP2, \
216+
Y0, X2, A4, Y0, TMP0, TMP1, TMP2, Y1, X2, A5, Y1, TMP0, TMP1, TMP2, \
217+
Y0, X3, A6, Y0, TMP0, TMP1, TMP2, Y1, X3, A7, Y1, TMP0, TMP1, TMP2, \
218+
Y0, X4, A8, Y0, TMP0, TMP1, TMP2, Y1, X4, A9, Y1, TMP0, TMP1, TMP2, \
219+
Y0, X5, A10, Y0, TMP0, TMP1, TMP2, Y1, X5, A11, Y1, TMP0, TMP1, TMP2, \
220+
Y0, X6, A12, Y0, TMP0, TMP1, TMP2, Y1, X6, A13, Y1, TMP0, TMP1, TMP2, \
221+
Y0, X7, A14, Y0, TMP0, TMP1, TMP2, Y1, X7, A15, Y1, TMP0, TMP1, TMP2
222+
.endm
223+
224+
// CSTORE_Y_8: contiguous-y store (inc_y == 1).
// Write Y0/Y1 (eight complex singles, 64 bytes) back to Y.
.macro CSTORE_Y_8
225+
GST xv, , Y0, Y, 0, Y1, Y, 0x20
226+
.endm
227+
228+
// CLOAD_X_1: broadcast a single (re,im) pair from X into X0 and scale it
// by alpha (GCOMPLEXMUL).  Used by the N-remainder column loop; works for
// any inc_x since only the element at X is read.  Clobbers TMP0-TMP2.
.macro CLOAD_X_1
229+
GLDREPL xv, d, X0, X, 0x00
230+
GCOMPLEXMUL GXCONJ, \
231+
xvf, s, X0, X0, VALPHA, TMP0, TMP1, TMP2
232+
.endm
233+
234+
// CLOAD_Y_1: load one complex single (64-bit pair) from Y into the low
// double of $f10 (the FP view of Y0's lane 0).
.macro CLOAD_Y_1
235+
fld.d $f10, Y, 0
236+
.endm
237+
238+
// CGEMV_N_1x8: one-row-by-eight-columns update (row remainder of the 8x8
// loop).  Loads one complex element from each of the eight column pointers
// into the low doubles of $f12/$f14/.../$f26 (the FP views of A0, A2, A4,
// ..., A14), post-incrementing each pointer by 0x08, then accumulates
// Y0 += X_j * A_(2j) with GCOMPLEXMADD.  Only lane 0 of Y0 is meaningful
// here — the caller stores just that lane via CSTORE_Y_1.
// Clobbers TMP0-TMP2.
.macro CGEMV_N_1x8
239+
GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0, $f16, PA2, 0, $f18, PA3, 0, \
240+
$f20, PA4, 0, $f22, PA5, 0, $f24, PA6, 0, $f26, PA7, 0
241+
GCOMPLEXMADD GXCONJ, GCONJ, \
242+
xvf, s, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, \
243+
Y0, X1, A2, Y0, TMP0, TMP1, TMP2, \
244+
Y0, X2, A4, Y0, TMP0, TMP1, TMP2, \
245+
Y0, X3, A6, Y0, TMP0, TMP1, TMP2, \
246+
Y0, X4, A8, Y0, TMP0, TMP1, TMP2, \
247+
Y0, X5, A10, Y0, TMP0, TMP1, TMP2, \
248+
Y0, X6, A12, Y0, TMP0, TMP1, TMP2, \
249+
Y0, X7, A14, Y0, TMP0, TMP1, TMP2
250+
.endm
251+
252+
// CSTORE_Y_1: store one complex single — lane 0 of Y0, via its FP view
// $f10 — back to Y.
.macro CSTORE_Y_1
253+
fst.d $f10, Y, 0
254+
.endm
255+
256+
// CGEMV_N_1x1: scalar complex FMA for the single-column remainder loop.
// Loads one (re,im) pair from PA0 into $f12 (FP view of A0's lane 0),
// advances PA0 by one complex element (8 bytes), and accumulates
// Y0 += X0 * A0 via GCOMPLEXMADD (lane 0 is the meaningful result).
// Clobbers TMP0-TMP2.
.macro CGEMV_N_1x1
257+
fld.d $f12, PA0, 0
258+
PTR_ADDI PA0, PA0, 0x08
259+
GCOMPLEXMADD GXCONJ, GCONJ, \
260+
xvf, s, Y0, X0, A0, Y0, TMP0, TMP1, TMP2
261+
.endm
262+
263+
// CGEMV_N_LASX: full kernel body, instantiated once per inc_x/inc_y
// combination.  XW is a unique label prefix (GAP_x_y); X_8/X_1/Y_8/Y_1
// select the contiguous or strided load/store macro variants defined above.
// Structure: outer loop over column blocks of 8 (J = N/8), inner loop over
// row blocks of 8 (I = M/8) with an M%8 scalar-row tail, then a per-column
// tail loop for the remaining N%8 columns.  K counts processed rows
// (reset per column block); K_LDA rewinds the column pointers past the
// rows already consumed (8*LDA - M8 bytes for the blocked loop).
.macro CGEMV_N_LASX XW:req, X_8:req, X_1:req, Y_8:req, Y_1:req
264+
PTR_SRLI J, N, 3 // J = N / 8 column blocks
265+
beqz J, .L_\XW\()_N_7
266+
PTR_SLLI K_LDA, LDA, 3
267+
PTR_SUB K_LDA, K_LDA, M8 // K_LDA = 8*LDA - M*8: advance 8 columns, rewind M rows
268+
.L_\XW\()_N_L8:
269+
CLOAD_\X_8 // X0..X7 = alpha * x[j..j+7]
270+
xor K, K, K
271+
move Y, Y_ORG // restart y for each column block
272+
PTR_SRLI I, M, 3 // I = M / 8 row blocks
273+
beqz I, .L_\XW\()_M_7
274+
.align 5
275+
.L_\XW\()_M_L8: // 8x8 blocked rows
276+
CLOAD_\Y_8
277+
CGEMV_N_8x8
278+
CSTORE_\Y_8
279+
PTR_ADDI I, I, -1
280+
PTR_ALSL Y, INC_Y, Y, 3 // Y += 8 * INC_Y (bytes)
281+
PTR_ADDI K, K, 8
282+
bnez I, .L_\XW\()_M_L8
283+
.L_\XW\()_M_7: // handle M % 8 remaining rows
284+
andi I, M, 7
285+
beqz I, .L_\XW\()_M_END
286+
.align 5
287+
.L_\XW\()_M_L1:
288+
CLOAD_\Y_1
289+
CGEMV_N_1x8
290+
CSTORE_\Y_1
291+
PTR_ADDI I, I, -1
292+
PTR_ADD Y, Y, INC_Y
293+
PTR_ADDI K, K, 1
294+
bnez I, .L_\XW\()_M_L1
295+
.L_\XW\()_M_END: // advance all 8 column pointers to the next block
296+
PTR_ADDI J, J, -1
297+
#if __loongarch_grlen == 64
298+
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
299+
PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
300+
#elif __loongarch_grlen == 32
301+
GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
302+
PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
303+
#else
304+
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
305+
PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
306+
#endif
307+
PTR_ALSL X, INC_X, X, 3 // X += 8 * INC_X (bytes)
308+
bnez J, .L_\XW\()_N_L8
309+
.L_\XW\()_N_7: // handle N % 8 remaining columns, one at a time
310+
andi J, N, 7
311+
beqz J, .L_END
312+
.L_\XW\()_N_L1:
313+
CLOAD_\X_1 // X0 = alpha * x[j]
314+
xor K, K, K
315+
move Y, Y_ORG
316+
move I, M
317+
beqz I, .L_END
318+
.align 5
319+
.L_\XW\()_N_1_M_L1: // scalar row loop for this single column
320+
CLOAD_\Y_1
321+
CGEMV_N_1x1
322+
CSTORE_\Y_1
323+
PTR_ADDI I, I, -1
324+
PTR_ADD Y, Y, INC_Y
325+
PTR_ADDI K, K, 1
326+
bnez I, .L_\XW\()_N_1_M_L1
327+
.L_\XW\()_N_1_M_END:
328+
PTR_ADDI J, J, -1
329+
PTR_SUB K_LDA, LDA, M8 // next column: advance LDA, rewind M rows
330+
PTR_ADD PA0, PA0, K_LDA
331+
PTR_ADD X, X, INC_X
332+
bnez J, .L_\XW\()_N_L1
333+
334+
b .L_END
335+
.endm
336+
337+
// Entry point.  C-equivalent signature (see header comment above):
//   int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,
//             FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x,
//             BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
// inc_y arrives on the stack (10th integer arg).  The prologue scales all
// strides to bytes (complex single = 8 bytes), builds the broadcast alpha
// vector VALPHA, sets up the 8 column pointers PA0..PA7, then dispatches
// through a .hword offset table to one of four CGEMV_N_LASX instances
// indexed by 2*(inc_x != 1) + (inc_y != 1).
PROLOGUE
338+
PTR_LD INC_Y, $sp, 0 // inc_y is passed on the stack
339+
push_if_used 17 + 7, 31
340+
PTR_ADDI K, $r0, 0x01
341+
PTR_SUB I, INC_X, K
342+
PTR_SUB J, INC_Y, K
343+
maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */
344+
maskeqz J, K, J /* if(inc_y == 1) j = 0; else j = 1; */
345+
PTR_ALSL I, I, J, 1 // I = 2*I + J: index into the 4-entry gap table
346+
GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3 // element counts -> byte strides (x8)
347+
// Init VALPHA
348+
xvpackev.w $xr0, $xr1, $xr0 // interleave alpha_r/alpha_i into one 64-bit pair
349+
xvreplve0.d VALPHA, $xr0 // broadcast (alpha_r, alpha_i) to all lanes
350+
move Y_ORG, Y
351+
move PA0, A
352+
#if __loongarch_grlen == 64
353+
GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
354+
PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
355+
#elif __loongarch_grlen == 32
356+
GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
357+
PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
358+
#else
359+
GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
360+
PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
361+
#endif
362+
la.local T0, .L_GAP_TABLE
363+
PTR_ALSL I, I, T0, 1 // I = &table[index] (2-byte entries)
364+
ld.h K, I, 0 // Obtain the offset address
365+
PTR_ADD T0, T0, K
366+
jirl $r0, T0, 0 // indirect jump to the selected variant
367+
.L_GAP_TABLE: // signed 16-bit offsets relative to the table base
368+
.hword .L_GAP_0_0 - .L_GAP_TABLE
369+
.hword .L_GAP_0_1 - .L_GAP_TABLE
370+
.hword .L_GAP_1_0 - .L_GAP_TABLE
371+
.hword .L_GAP_1_1 - .L_GAP_TABLE
372+
.L_GAP_0_0: /* if (inc_x == 1) && (incy == 1) */
373+
CGEMV_N_LASX GAP_0_0, X_8, X_1, Y_8, Y_1
374+
.L_GAP_0_1: /* if (inc_x == 1) && (incy != 1) */
375+
CGEMV_N_LASX GAP_0_1, X_8, X_1, Y_8_GAP, Y_1
376+
.L_GAP_1_0: /* if (inc_x != 1) && (incy == 1) */
377+
CGEMV_N_LASX GAP_1_0, X_8_GAP, X_1, Y_8, Y_1
378+
.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */
379+
CGEMV_N_LASX GAP_1_1, X_8_GAP, X_1, Y_8_GAP, Y_1
380+
.L_END: // each variant ends with `b .L_END`, so no fall-through between them
381+
pop_if_used 17 + 7, 31
382+
jirl $r0, $r1, 0x0 // return via $r1 (ra)
383+
EPILOGUE

0 commit comments

Comments
 (0)