Skip to content

Commit 20a8e48

Browse files
committed
LoongArch64: Update ssymv LASX version
1 parent e074858 commit 20a8e48

File tree

2 files changed

+206
-188
lines changed

2 files changed

+206
-188
lines changed

kernel/loongarch64/ssymv_L_lasx.S

Lines changed: 110 additions & 100 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2828
#define ASSEMBLER
2929

3030
#include "common.h"
31+
#include "loongarch64_asm.S"
3132

3233
/* Param */
3334
#define M $r4
@@ -57,6 +58,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
5758
#define T2 $r28
5859
#define T3 $r29
5960
#define T4 $r30
61+
#define T5 $r17
62+
#define T6 $r16
6063

6164
/* LASX vectors */
6265
#define U0 $xr31
@@ -87,75 +90,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
8790
#define a8 $f8
8891
#define a9 $f9
8992

90-
91-
PROLOGUE
92-
93-
LDARG BUFFER, $sp, 0
94-
95-
addi.d $sp, $sp, -88
96-
97-
SDARG $r23, $sp, 0
98-
SDARG $r24, $sp, 8
99-
SDARG $r25, $sp, 16
100-
SDARG $r26, $sp, 32
101-
SDARG $r27, $sp, 40
102-
SDARG $r28, $sp, 48
103-
SDARG $r29, $sp, 56
104-
SDARG $r30, $sp, 64
105-
SDARG $r31, $sp, 72
106-
ST ALPHA, $sp, 80
107-
108-
xvldrepl.w VALPHA, $sp, 80
109-
110-
slli.d LDA, LDA, BASE_SHIFT
111-
slli.d INCX, INCX, BASE_SHIFT
112-
slli.d INCY, INCY, BASE_SHIFT
113-
114-
bge $r0, M, .L999
115-
bge $r0, N, .L999
116-
117-
move J, $r0
118-
move JY, $r0
119-
move JX, $r0
120-
move AO1, A
121-
122-
beq J, N, .L999
123-
124-
.L01:
125-
MTC a2, $r0 //temp2
126-
fldx.s a6, X, JX
127-
fmul.s a3, ALPHA, a6 //temp1
128-
xvreplve0.w U3, U3
129-
xvreplve0.w U2, U2
130-
131-
mul.w T0, J, LDA
132-
slli.d T1, J, BASE_SHIFT
133-
add.w T0, T0, T1
134-
fldx.s a6, AO1, T0
135-
fldx.s a4, Y, JY
136-
fmadd.s a4, a3, a6, a4
137-
fstx.s a4, Y, JY
138-
139-
move IY, JY
140-
move IX, JX
141-
addi.d II, J, 1
142-
move I, II
143-
slli.d II, II, BASE_SHIFT
144-
145-
sub.d T0, M, J
146-
addi.d T0, T0, -1
147-
srai.d T0, T0, 3
148-
add.d T0, T0, J
149-
addi.d T0, T0, 1
150-
beq I, T0, .L03
151-
bge I, T0, .L03
152-
153-
mul.w T1, J, LDA
154-
add.d T1, T1, II
155-
156-
.L02: /* /8 */
157-
xvldx U1, AO1, T1
158-
93+
.macro LOAD_Y_8
94+
beqz T5, .L01_Y_0
15995
add.d T2, IY, INCY
16096
fldx.s $f4, Y, T2
16197
add.d T2, T2, INCY
@@ -180,11 +116,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
180116
vextrins.w $vr8, $vr9, 0x10
181117
vextrins.w $vr8, $vr10, 0x20
182118
vextrins.w $vr8, $vr11, 0x30
183-
xvpermi.q U4, U8, 0x02
184-
185-
xvfmadd.s U4, U3, U1, U4
186-
187-
xvpermi.d U8, U4, 0xee
119+
xvpermi.q U4, U8, 0x02
120+
b .L01_Y_1
121+
.L01_Y_0:
122+
add.d T3, IY, INCY
123+
xvldx U4, Y, T3
124+
.L01_Y_1:
125+
.endm
126+
127+
.macro STORE_Y_8
128+
beqz T5, .L01_Y_2
129+
xvpermi.d U8, U4, 0xee
188130
vextrins.w $vr5, $vr4, 0x01
189131
vextrins.w $vr6, $vr4, 0x02
190132
vextrins.w $vr7, $vr4, 0x03
@@ -209,10 +151,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
209151
fstx.s $f10, Y, T2
210152
add.d T2, T2, INCY
211153
fstx.s $f11, Y, T2
212-
213-
slli.d T2, INCY, 3
214-
add.d IY, IY, T2
215-
154+
b .L01_Y_3
155+
.L01_Y_2:
156+
xvstx U4, Y, T3
157+
.L01_Y_3:
158+
.endm
159+
160+
.macro LOAD_X_8
161+
beqz T6, .L01_X_0
216162
add.d T2, IX, INCX
217163
fldx.s $f4, X, T2
218164
add.d T2, T2, INCX
@@ -238,39 +184,103 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
238184
vextrins.w $vr8, $vr10, 0x20
239185
vextrins.w $vr8, $vr11, 0x30
240186
xvpermi.q U4, U8, 0x02
187+
b .L01_X_1
188+
.L01_X_0:
189+
add.d T3, IX, INCX
190+
xvldx U4, X, T3
191+
.L01_X_1:
192+
.endm
193+
194+
PROLOGUE
241195

242-
xvand.v $xr12, $xr2, $xr2
196+
addi.d $sp, $sp, -88
243197

244-
xvfmadd.s U2, U1, U4, U2
245-
xvfsub.s U2, U2, $xr12
198+
SDARG $r23, $sp, 0
199+
SDARG $r24, $sp, 8
200+
SDARG $r25, $sp, 16
201+
SDARG $r26, $sp, 32
202+
SDARG $r27, $sp, 40
203+
SDARG $r28, $sp, 48
204+
SDARG $r29, $sp, 56
205+
SDARG $r30, $sp, 64
206+
SDARG $r31, $sp, 72
207+
ST ALPHA, $sp, 80
246208

247-
xvpickve.w U4, U2, 0x01
248-
xvpickve.w U5, U2, 0x02
249-
xvpickve.w U6, U2, 0x03
250-
xvpickve.w U7, U2, 0x04
251-
xvpickve.w U8, U2, 0x05
252-
xvpickve.w U9, U2, 0x06
253-
xvpickve.w U10, U2, 0x07
209+
xvldrepl.w VALPHA, $sp, 80
254210

255-
fadd.s $f2, $f2, $f4
256-
fadd.s $f2, $f2, $f5
257-
fadd.s $f2, $f2, $f6
258-
fadd.s $f2, $f2, $f7
259-
fadd.s $f2, $f2, $f8
260-
fadd.s $f2, $f2, $f9
261-
fadd.s $f2, $f2, $f10
262-
fadd.s $f2, $f2, $f12
211+
addi.d T5, INCY, -1
212+
addi.d T6, INCX, -1
213+
slli.d LDA, LDA, BASE_SHIFT
214+
slli.d INCX, INCX, BASE_SHIFT
215+
slli.d INCY, INCY, BASE_SHIFT
263216

264-
xvreplve0.d U2, U2
217+
bge $r0, M, .L999
218+
bge $r0, N, .L999
219+
220+
move J, $r0
221+
move JY, $r0
222+
move JX, $r0
223+
move AO1, A
265224

266-
slli.d T2, INCX, 3
267-
add.d IX, IX, T2
225+
beq J, N, .L999
226+
227+
.L01:
228+
xvxor.v U2, U2, U2
229+
fldx.s a6, X, JX
230+
fmul.s a3, ALPHA, a6 //temp1
231+
xvreplve0.w U3, U3
232+
233+
mul.w T0, J, LDA
234+
slli.d T1, J, BASE_SHIFT
235+
add.w T0, T0, T1
236+
fldx.s a6, AO1, T0
237+
fldx.s a4, Y, JY
238+
fmadd.s a4, a3, a6, a4
239+
fstx.s a4, Y, JY
240+
241+
move IY, JY
242+
move IX, JX
243+
addi.d II, J, 1
244+
move I, II
245+
slli.d II, II, BASE_SHIFT
246+
247+
sub.d T0, M, J
248+
addi.d T0, T0, -1
249+
srai.d T0, T0, 3
250+
add.d T0, T0, J
251+
addi.d T0, T0, 1
252+
beq I, T0, .L03
253+
bge I, T0, .L03
254+
255+
mul.w T1, J, LDA
256+
add.d T1, T1, II
257+
258+
.L02: /* /8 */
259+
xvldx U1, AO1, T1
260+
261+
LOAD_Y_8
262+
263+
xvfmadd.s U4, U3, U1, U4
264+
265+
STORE_Y_8
266+
267+
alsl.d IY, INCY, IY, 3
268+
269+
LOAD_X_8
270+
271+
xvfmadd.s U2, U1, U4, U2
272+
273+
alsl.d IX, INCX, IX, 3
268274

269275
addi.d II, II, 32
270276
addi.d T1, T1, 32
271277
addi.d I, I, 1
272278
blt I, T0, .L02
273279

280+
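//Acc U2: fold the partial sums accumulated in U2 via GACC (from loongarch64_asm.S; presumably writes the reduction to U4), then copy $f4 into $f2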
281+
GACC xvf, s, U4, U2
282+
fmov.d $f2, $f4
283+
274284
.L03: /* &4 */
275285
sub.d T0, M, J
276286
addi.d T0, T0, -1
@@ -433,4 +443,4 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
433443
addi.d $sp, $sp, 88
434444
jirl $r0, $r1, 0x0
435445

436-
EPILOGUE
446+
EPILOGUE

0 commit comments

Comments
 (0)