Skip to content

Commit a9070ba

Browse files
committed
LoongArch64: Update ssymv LSX version
1 parent 9b98103 commit a9070ba

File tree

2 files changed

+223
-178
lines changed

2 files changed

+223
-178
lines changed

kernel/loongarch64/ssymv_L_lsx.S

Lines changed: 118 additions & 94 deletions
Original file line number | Diff line number | Diff line change
@@ -28,6 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2828
#define ASSEMBLER
2929

3030
#include "common.h"
31+
#include "loongarch64_asm.S"
3132

3233
/* Param */
3334
#define M $r4
@@ -57,6 +58,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
5758
#define T2 $r28
5859
#define T3 $r29
5960
#define T4 $r30
61+
#define T5 $r17
62+
#define T6 $r16
6063

6164
/* LSX vectors */
6265
#define U0 $vr31
@@ -88,77 +91,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
8891
#define a9 $f9
8992

9093

91-
PROLOGUE
92-
93-
LDARG BUFFER, $sp, 0
94-
95-
addi.d $sp, $sp, -88
96-
97-
SDARG $r23, $sp, 0
98-
SDARG $r24, $sp, 8
99-
SDARG $r25, $sp, 16
100-
SDARG $r26, $sp, 32
101-
SDARG $r27, $sp, 40
102-
SDARG $r28, $sp, 48
103-
SDARG $r29, $sp, 56
104-
SDARG $r30, $sp, 64
105-
SDARG $r31, $sp, 72
106-
ST ALPHA, $sp, 80
107-
108-
vldrepl.w VALPHA, $sp, 80
109-
110-
slli.d LDA, LDA, BASE_SHIFT
111-
slli.d INCX, INCX, BASE_SHIFT
112-
slli.d INCY, INCY, BASE_SHIFT
113-
114-
bge $r0, M, .L999
115-
bge $r0, N, .L999
116-
117-
move J, $r0
118-
move JY, $r0
119-
move JX, $r0
120-
move AO1, A
121-
122-
beq J, N, .L999
123-
124-
.L01:
125-
MTC a2, $r0 //temp2
126-
fldx.s a6, X, JX
127-
fmul.s a3, ALPHA, a6 //temp1
128-
vpermi.w U3, U3, 0x00
129-
vpermi.w U2, U2, 0x00
130-
131-
mul.w T0, J, LDA
132-
slli.d T1, J, BASE_SHIFT
133-
add.w T0, T0, T1
134-
fldx.s a6, AO1, T0
135-
fldx.s a4, Y, JY
136-
fmadd.s a4, a3, a6, a4
137-
fstx.s a4, Y, JY
138-
139-
move IY, JY
140-
move IX, JX
141-
addi.d II, J, 1
142-
move I, II
143-
slli.d II, II, BASE_SHIFT
144-
145-
sub.d T0, M, J
146-
addi.d T0, T0, -1
147-
srai.d T0, T0, 3
148-
add.d T0, T0, J
149-
addi.d T0, T0, 1
150-
beq I, T0, .L03
151-
bge I, T0, .L03
152-
153-
mul.w T1, J, LDA
154-
add.d T1, T1, II
155-
156-
.L02: /* /8 */
157-
vldx U1, AO1, T1
158-
addi.d T1, T1, 16
159-
vldx U14, AO1, T1
160-
addi.d T1, T1, 16
161-
94+
.macro LOAD_Y_8
95+
beqz T5, .L01_Y_0
16296
add.d T2, IY, INCY
16397
fldx.s $f4, Y, T2
16498
add.d T2, T2, INCY
@@ -183,10 +117,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
183117
vextrins.w U8, U9, 0x10
184118
vextrins.w U8, U10, 0x20
185119
vextrins.w U8, U11, 0x30
186-
187-
vfmadd.s U4, U3, U1, U4
188-
vfmadd.s U8, U3, U14, U8
189-
120+
b .L01_Y_1
121+
.L01_Y_0:
122+
add.d T3, IY, INCY
123+
vldx U4, Y, T3
124+
alsl.d T4, INCY, T3, 2
125+
vldx U8, Y, T4
126+
.L01_Y_1:
127+
.endm
128+
129+
.macro STORE_Y_8
130+
beqz T5, .L01_Y_2
190131
vextrins.w U5, U4, 0x01
191132
vextrins.w U6, U4, 0x02
192133
vextrins.w U7, U4, 0x03
@@ -211,10 +152,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
211152
fstx.s $f10, Y, T2
212153
add.d T2, T2, INCY
213154
fstx.s $f11, Y, T2
214-
215-
slli.d T2, INCY, 3
216-
add.d IY, IY, T2
217-
155+
b .L01_Y_3
156+
.L01_Y_2:
157+
vstx U4, Y, T3
158+
vstx U8, Y, T4
159+
.L01_Y_3:
160+
.endm
161+
162+
.macro LOAD_X_8
163+
beqz T6, .L01_X_0
218164
add.d T2, IX, INCX
219165
fldx.s $f4, X, T2
220166
add.d T2, T2, INCX
@@ -239,31 +185,109 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
239185
vextrins.w $vr8, $vr9, 0x10
240186
vextrins.w $vr8, $vr10, 0x20
241187
vextrins.w $vr8, $vr11, 0x30
188+
b .L01_X_1
189+
.L01_X_0:
190+
add.d T3, IX, INCX
191+
vldx U4, X, T3
192+
alsl.d T4, INCX, T3, 2
193+
vldx U8, X, T4
194+
.L01_X_1:
195+
.endm
242196

243-
vand.v $vr12, $vr2, $vr2
197+
PROLOGUE
244198

245-
vfmadd.s U2, U1, U4, U2
246-
vfsub.s U2, U2, $vr12
247-
vfmadd.s U2, U14, U8, U2
199+
addi.d $sp, $sp, -88
248200

249-
vextrins.w U4, U2, 0x01
250-
vextrins.w U5, U2, 0x02
251-
vextrins.w U6, U2, 0x03
201+
SDARG $r23, $sp, 0
202+
SDARG $r24, $sp, 8
203+
SDARG $r25, $sp, 16
204+
SDARG $r26, $sp, 32
205+
SDARG $r27, $sp, 40
206+
SDARG $r28, $sp, 48
207+
SDARG $r29, $sp, 56
208+
SDARG $r30, $sp, 64
209+
SDARG $r31, $sp, 72
210+
ST ALPHA, $sp, 80
252211

253-
fadd.s $f2, $f2, $f4
254-
fadd.s $f2, $f2, $f5
255-
fadd.s $f2, $f2, $f6
256-
fadd.s $f2, $f2, $f12
212+
vldrepl.w VALPHA, $sp, 80
257213

258-
vpermi.w U2, U2, 0x00
214+
addi.d T5, INCY, -1
215+
addi.d T6, INCX, -1
216+
slli.d LDA, LDA, BASE_SHIFT
217+
slli.d INCX, INCX, BASE_SHIFT
218+
slli.d INCY, INCY, BASE_SHIFT
219+
220+
bge $r0, M, .L999
221+
bge $r0, N, .L999
222+
223+
move J, $r0
224+
move JY, $r0
225+
move JX, $r0
226+
move AO1, A
227+
228+
beq J, N, .L999
229+
230+
.L01:
231+
vxor.v U2, U2, U2
232+
fldx.s a6, X, JX
233+
fmul.s a3, ALPHA, a6 //temp1
234+
vpermi.w U3, U3, 0x00
235+
236+
mul.w T0, J, LDA
237+
slli.d T1, J, BASE_SHIFT
238+
add.w T0, T0, T1
239+
fldx.s a6, AO1, T0
240+
fldx.s a4, Y, JY
241+
fmadd.s a4, a3, a6, a4
242+
fstx.s a4, Y, JY
243+
244+
move IY, JY
245+
move IX, JX
246+
addi.d II, J, 1
247+
move I, II
248+
slli.d II, II, BASE_SHIFT
259249

260-
slli.d T2, INCX, 3
261-
add.d IX, IX, T2
250+
sub.d T0, M, J
251+
addi.d T0, T0, -1
252+
srai.d T0, T0, 3
253+
add.d T0, T0, J
254+
addi.d T0, T0, 1
255+
beq I, T0, .L03
256+
bge I, T0, .L03
257+
258+
mul.w T1, J, LDA
259+
add.d T1, T1, II
260+
261+
.L02: /* /8 */
262+
vldx U1, AO1, T1
263+
addi.d T1, T1, 16
264+
vldx U14, AO1, T1
265+
addi.d T1, T1, 16
266+
267+
LOAD_Y_8
268+
269+
vfmadd.s U4, U3, U1, U4
270+
vfmadd.s U8, U3, U14, U8
271+
272+
STORE_Y_8
273+
274+
alsl.d IY, INCY, IY, 3
275+
276+
LOAD_X_8
277+
278+
vfmadd.s U2, U1, U4, U2
279+
vfmadd.s U2, U14, U8, U2
280+
281+
alsl.d IX, INCX, IX, 3
262282

263283
addi.d II, II, 32
264284
addi.d I, I, 1
265285
blt I, T0, .L02
266286

287+
// Accumulate the elements of U2 (GACC output goes to U4, then is moved back into U2)
288+
GACC vf, s, U4, U2
289+
vpermi.w U2, U4, 0
290+
267291
.L03: /* &4 */
268292
sub.d T0, M, J
269293
addi.d T0, T0, -1
@@ -426,4 +450,4 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
426450
addi.d $sp, $sp, 88
427451
jirl $r0, $r1, 0x0
428452

429-
EPILOGUE
453+
EPILOGUE

0 commit comments

Comments
 (0)