Skip to content

Commit e0a8216

Browse files
committed
LoongArch64: Update dsymv LSX version
1 parent a9070ba commit e0a8216

File tree

2 files changed

+241
-167
lines changed

2 files changed

+241
-167
lines changed

kernel/loongarch64/dsymv_L_lsx.S

Lines changed: 123 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2828
#define ASSEMBLER
2929

3030
#include "common.h"
31+
#include "loongarch64_asm.S"
3132

3233
/* Param */
3334
#define M $r4
@@ -57,6 +58,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
5758
#define T2 $r28
5859
#define T3 $r29
5960
#define T4 $r30
61+
#define T5 $r17
62+
#define T6 $r16
63+
#define T7 $r12
6064

6165
/* LSX vectors */
6266
#define U0 $vr31
@@ -87,10 +91,114 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
8791
#define a8 $f8
8892
#define a9 $f9
8993

94+
// LOAD_Y_8: load eight consecutive Y elements, starting one INCY step past
// byte offset IY, into U4, U6, U8, U10 (two doubles per vector).
//
// T5 selects the path. The caller sets T5 = INCY - 1 before INCY is scaled
// by BASE_SHIFT, so T5 == 0 means unit stride.
//   T5 != 0 : eight scalar fldx.d loads into $f4..$f11, then pairs are merged
//             with vextrins.d. Clobbers T2.
//   T5 == 0 : four vldx vector loads. Leaves the four byte offsets in
//             T7, T2, T3, T4.
// NOTE(review): the strided path presumes U4..U11 name the vector registers
// whose low 64 bits are $f4..$f11; those #defines are not visible here.
// NOTE(review): the labels are fixed names, so expanding this macro more than
// once in the same file would define them twice.
.macro LOAD_Y_8
95+
beqz T5, .L01_Y_0
96+
// Strided path: T2 walks IY + 1*INCY .. IY + 8*INCY, one scalar per step.
add.d T2, IY, INCY
97+
fldx.d $f4, Y, T2
98+
add.d T2, T2, INCY
99+
fldx.d $f5, Y, T2
100+
add.d T2, T2, INCY
101+
fldx.d $f6, Y, T2
102+
add.d T2, T2, INCY
103+
fldx.d $f7, Y, T2
90104

// NOTE(review): the next two lines carry a '-' marker in this diff view, i.e.
// they appear to be a removed line of the old file, not part of the macro.
91-
PROLOGUE
105+
add.d T2, T2, INCY
106+
fldx.d $f8, Y, T2
107+
add.d T2, T2, INCY
108+
fldx.d $f9, Y, T2
109+
add.d T2, T2, INCY
110+
fldx.d $f10, Y, T2
111+
add.d T2, T2, INCY
112+
fldx.d $f11, Y, T2
113+
114+
// Pack pairs: element 0 of the odd register goes into element 1 of the even one.
vextrins.d U4, U5, 0x10
115+
vextrins.d U6, U7, 0x10
116+
vextrins.d U8, U9, 0x10
117+
vextrins.d U10, U11, 0x10
118+
b .L01_Y_1
119+
.L01_Y_0:
120+
// Unit-stride path: each alsl.d adds 2*INCY to the previous offset, so the
// four vector loads cover the same eight elements as the strided path.
add.d T7, IY, INCY
121+
vldx U4, Y, T7
122+
alsl.d T2, INCY, T7, 1
123+
vldx U6, Y, T2
124+
alsl.d T3, INCY, T2, 1
125+
vldx U8, Y, T3
126+
alsl.d T4, INCY, T3, 1
127+
vldx U10, Y, T4
128+
.L01_Y_1:
129+
.endm
130+
131+
// LOAD_X_8: load eight consecutive X elements, starting one INCX step past
// byte offset IX, into U4, U6, U8, U10 (two doubles per vector).
//
// T6 selects the path. The caller sets T6 = INCX - 1 before INCX is scaled
// by BASE_SHIFT, so T6 == 0 means unit stride.
//   T6 != 0 : eight scalar fldx.d loads into $f4..$f11, then pairs are merged
//             with vextrins.d. Clobbers T2.
//   T6 == 0 : four vldx vector loads. Clobbers T7, T2, T3, T4.
// NOTE(review): the strided path presumes U4..U11 name the vector registers
// whose low 64 bits are $f4..$f11; those #defines are not visible here.
// NOTE(review): the labels are fixed names, so expanding this macro more than
// once in the same file would define them twice.
.macro LOAD_X_8
132+
beqz T6, .L01_X_0
133+
// Strided path: T2 walks IX + 1*INCX .. IX + 8*INCX, one scalar per step.
add.d T2, IX, INCX
134+
fldx.d $f4, X, T2
135+
add.d T2, T2, INCX
136+
fldx.d $f5, X, T2
137+
add.d T2, T2, INCX
138+
fldx.d $f6, X, T2
139+
add.d T2, T2, INCX
140+
fldx.d $f7, X, T2
141+
142+
add.d T2, T2, INCX
143+
fldx.d $f8, X, T2
144+
add.d T2, T2, INCX
145+
fldx.d $f9, X, T2
146+
add.d T2, T2, INCX
147+
fldx.d $f10, X, T2
148+
add.d T2, T2, INCX
149+
fldx.d $f11, X, T2
150+
151+
// Pack pairs: element 0 of the odd register goes into element 1 of the even one.
vextrins.d U4, U5, 0x10
152+
vextrins.d U6, U7, 0x10
153+
vextrins.d U8, U9, 0x10
154+
vextrins.d U10, U11, 0x10
155+
b .L01_X_1
156+
.L01_X_0:
157+
// Unit-stride path: each alsl.d adds 2*INCX to the previous offset, so the
// four vector loads cover the same eight elements as the strided path.
add.d T7, IX, INCX
158+
vldx U4, X, T7
159+
alsl.d T2, INCX, T7, 1
160+
vldx U6, X, T2
161+
alsl.d T3, INCX, T2, 1
162+
vldx U8, X, T3
163+
alsl.d T4, INCX, T3, 1
164+
vldx U10, X, T4
165+
.L01_X_1:
166+
.endm
167+
168+
// STORE_Y_8: write U4, U6, U8, U10 (two doubles each) back to the eight Y
// elements starting one INCY step past byte offset IY.
//
// T5 selects the path; T5 == 0 means unit stride (T5 = INCY - 1, taken by the
// caller before INCY is scaled by BASE_SHIFT).
//   T5 != 0 : the high element of each even register is moved into the odd
//             register, then eight scalar fstx.d stores. Clobbers T2 and
//             U5, U7, U9, U11.
//   T5 == 0 : four vstx stores at offsets T7, T2, T3, T4. This path does NOT
//             compute those offsets; it relies on them still holding the
//             values left by the unit-stride path of LOAD_Y_8. Nothing that
//             writes T7/T2/T3/T4 (LOAD_X_8 does) may run between the two.
// NOTE(review): the strided path presumes U4..U11 name the vector registers
// whose low 64 bits are $f4..$f11; those #defines are not visible here.
// NOTE(review): the labels are fixed names, so expanding this macro more than
// once in the same file would define them twice.
.macro STORE_Y_8
169+
beqz T5, .L01_Y_2
170+
// Unpack pairs: element 1 of the even register goes into element 0 of the odd one.
vextrins.d U5, U4, 0x01
171+
vextrins.d U7, U6, 0x01
172+
vextrins.d U9, U8, 0x01
173+
vextrins.d U11, U10, 0x01
174+
175+
// Strided path: T2 walks IY + 1*INCY .. IY + 8*INCY, one scalar per step.
add.d T2, IY, INCY
176+
fstx.d $f4, Y, T2
177+
add.d T2, T2, INCY
178+
fstx.d $f5, Y, T2
179+
add.d T2, T2, INCY
180+
fstx.d $f6, Y, T2
181+
add.d T2, T2, INCY
182+
fstx.d $f7, Y, T2
183+
184+
add.d T2, T2, INCY
185+
fstx.d $f8, Y, T2
186+
add.d T2, T2, INCY
187+
fstx.d $f9, Y, T2
188+
add.d T2, T2, INCY
189+
fstx.d $f10, Y, T2
190+
add.d T2, T2, INCY
191+
fstx.d $f11, Y, T2
192+
b .L01_Y_3
193+
.L01_Y_2:
194+
// Unit-stride path: reuse the offsets computed by LOAD_Y_8 (see header).
vstx U4, Y, T7
195+
vstx U6, Y, T2
196+
vstx U8, Y, T3
197+
vstx U10, Y, T4
198+
.L01_Y_3:
199+
.endm
92200

93-
LDARG BUFFER, $sp, 0
201+
PROLOGUE
94202

95203
addi.d $sp, $sp, -88
96204

@@ -107,6 +215,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
107215

108216
vldrepl.d VALPHA, $sp, 80
109217

218+
addi.d T5, INCY, -1
219+
addi.d T6, INCX, -1
110220
slli.d LDA, LDA, BASE_SHIFT
111221
slli.d INCX, INCX, BASE_SHIFT
112222
slli.d INCY, INCY, BASE_SHIFT
@@ -122,11 +232,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
122232
beq J, N, .L999
123233

124234
.L01:
125-
MTC a2, $r0 //temp2
235+
vxor.v U2, U2, U2
126236
fldx.d a6, X, JX
127237
fmul.d a3, ALPHA, a6 //temp1
128238
vshuf4i.d U3, U3, 0x00
129-
vshuf4i.d U2, U2, 0x00
130239

131240
mul.d T0, J, LDA
132241
slli.d T1, J, BASE_SHIFT
@@ -163,105 +272,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
163272
vldx U16, AO1, T1
164273
addi.d T1, T1, 16
165274

166-
add.d T2, IY, INCY
167-
fldx.d $f4, Y, T2
168-
add.d T2, T2, INCY
169-
fldx.d $f5, Y, T2
170-
add.d T2, T2, INCY
171-
fldx.d $f6, Y, T2
172-
add.d T2, T2, INCY
173-
fldx.d $f7, Y, T2
174-
175-
add.d T2, T2, INCY
176-
fldx.d $f8, Y, T2
177-
add.d T2, T2, INCY
178-
fldx.d $f9, Y, T2
179-
add.d T2, T2, INCY
180-
fldx.d $f10, Y, T2
181-
add.d T2, T2, INCY
182-
fldx.d $f11, Y, T2
183-
184-
vextrins.d U4, U5, 0x10
185-
vextrins.d U6, U7, 0x10
186-
vextrins.d U8, U9, 0x10
187-
vextrins.d U10, U11, 0x10
275+
LOAD_Y_8
188276

189277
vfmadd.d U4, U3, U1, U4
190278
vfmadd.d U6, U3, U14, U6
191279
vfmadd.d U8, U3, U15, U8
192280
vfmadd.d U10, U3, U16, U10
193281

194-
vextrins.d U5, U4, 0x01
195-
vextrins.d U7, U6, 0x01
196-
vextrins.d U9, U8, 0x01
197-
vextrins.d U11, U10, 0x01
198-
199-
add.d T2, IY, INCY
200-
fstx.d $f4, Y, T2
201-
add.d T2, T2, INCY
202-
fstx.d $f5, Y, T2
203-
add.d T2, T2, INCY
204-
fstx.d $f6, Y, T2
205-
add.d T2, T2, INCY
206-
fstx.d $f7, Y, T2
207-
208-
add.d T2, T2, INCY
209-
fstx.d $f8, Y, T2
210-
add.d T2, T2, INCY
211-
fstx.d $f9, Y, T2
212-
add.d T2, T2, INCY
213-
fstx.d $f10, Y, T2
214-
add.d T2, T2, INCY
215-
fstx.d $f11, Y, T2
216-
217-
slli.d T2, INCY, 3
218-
add.d IY, IY, T2
219-
220-
add.d T2, IX, INCX
221-
fldx.d $f4, X, T2
222-
add.d T2, T2, INCX
223-
fldx.d $f5, X, T2
224-
add.d T2, T2, INCX
225-
fldx.d $f6, X, T2
226-
add.d T2, T2, INCX
227-
fldx.d $f7, X, T2
228-
229-
add.d T2, T2, INCX
230-
fldx.d $f8, X, T2
231-
add.d T2, T2, INCX
232-
fldx.d $f9, X, T2
233-
add.d T2, T2, INCX
234-
fldx.d $f10, X, T2
235-
add.d T2, T2, INCX
236-
fldx.d $f11, X, T2
282+
STORE_Y_8
237283

238-
vextrins.d U4, U5, 0x10
239-
vextrins.d U6, U7, 0x10
240-
vextrins.d U8, U9, 0x10
241-
vextrins.d U10, U11, 0x10
284+
alsl.d IY, INCY, IY, 3
242285

243-
vand.v $vr12, $vr2, $vr2
286+
LOAD_X_8
244287

245288
vfmadd.d U2, U1, U4, U2
246-
vfsub.d U2, U2, $vr12
247289
vfmadd.d U2, U14, U6, U2
248290
vfmadd.d U2, U15, U8, U2
249291
vfmadd.d U2, U16, U10, U2
250292

251-
vextrins.d U4, U2, 0x01
252-
253-
fadd.d $f2, $f2, $f4
254-
fadd.d $f2, $f2, $f12
255-
256-
vextrins.d U2, U2, 0x10
257-
258-
slli.d T2, INCX, 3
259-
add.d IX, IX, T2
293+
alsl.d IX, INCX, IX, 3
260294

261295
addi.d II, II, 64
262296
addi.d I, I, 1
263297
blt I, T0, .L02
264298

299+
// Acc U2
300+
GACC vf, d, U4, U2
301+
vilvl.d U2, U4, U4
302+
265303
.L03: /* &4 */
266304
sub.d T0, M, J
267305
addi.d T0, T0, -1
@@ -429,4 +467,4 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
429467
addi.d $sp, $sp, 88
430468
jirl $r0, $r1, 0x0
431469

432-
EPILOGUE
470+
EPILOGUE

0 commit comments

Comments
 (0)