@@ -28,6 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28
28
#define ASSEMBLER
29
29
30
30
#include "common.h"
31
+ #include "loongarch64_asm.S"
31
32
32
33
/* Param */
33
34
#define M $r4
@@ -57,6 +58,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
57
58
#define T2 $r28
58
59
#define T3 $r29
59
60
#define T4 $r30
61
+ #define T5 $r17 /* set to INCY - 1 before INCY is scaled; zero selects the vldx/vstx path in LOAD_Y_8 and STORE_Y_8 */
62
+ #define T6 $r16 /* set to INCX - 1 before INCX is scaled; zero selects the vldx path in LOAD_X_8 */
60
63
61
64
/* LSX vectors */
62
65
#define U0 $vr31
@@ -88,77 +91,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
88
91
#define a9 $f9
89
92
90
93
91
- PROLOGUE
92
-
93
- LDARG BUFFER, $sp , 0
94
-
95
- addi .d $sp , $sp , -88
96
-
97
- SDARG $r23, $sp , 0
98
- SDARG $r24, $sp , 8
99
- SDARG $r25, $sp , 16
100
- SDARG $r26, $sp , 32
101
- SDARG $r27, $sp , 40
102
- SDARG $r28, $sp , 48
103
- SDARG $r29, $sp , 56
104
- SDARG $r30, $sp , 64
105
- SDARG $r31, $sp , 72
106
- ST ALPHA, $sp , 80
107
-
108
- vldrepl.w VALPHA, $sp , 80
109
-
110
- slli.d LDA, LDA, BASE_SHIFT
111
- slli.d INCX, INCX, BASE_SHIFT
112
- slli.d INCY, INCY, BASE_SHIFT
113
-
114
- bge $r0, M, .L999
115
- bge $r0, N, .L999
116
-
117
- move J, $r0
118
- move JY, $r0
119
- move JX, $r0
120
- move AO1, A
121
-
122
- beq J , N, .L999
123
-
124
- .L01:
125
- MTC a2 , $r0 //temp2
126
- fldx.s a6 , X, JX
127
- fmul .s a3, ALPHA, a6 //temp1
128
- vpermi.w U3, U3, 0x00
129
- vpermi.w U2, U2, 0x00
130
-
131
- mul.w T0, J, LDA
132
- slli.d T1, J, BASE_SHIFT
133
- add.w T0, T0, T1
134
- fldx.s a6 , AO1, T0
135
- fldx.s a4 , Y, JY
136
- fmadd.s a4 , a3 , a6 , a4
137
- fstx.s a4 , Y, JY
138
-
139
- move IY, JY
140
- move IX, JX
141
- addi .d II, J, 1
142
- move I, II
143
- slli.d II, II, BASE_SHIFT
144
-
145
- sub .d T0, M, J
146
- addi .d T0, T0, -1
147
- srai.d T0, T0, 3
148
- add .d T0, T0, J
149
- addi .d T0, T0, 1
150
- beq I , T0, .L03
151
- bge I , T0, .L03
152
-
153
- mul.w T1, J, LDA
154
- add .d T1, T1, II
155
-
156
- .L02: /* /8 */
157
- vldx U1, AO1, T1
158
- addi .d T1, T1, 16
159
- vldx U14, AO1, T1
160
- addi .d T1, T1, 16
161
-
94
+ .macro LOAD_Y_8 // load Y operands for one pass of the 8-wide loop; T5 chooses between two paths
95
+ beqz T5, .L01_Y_0 // T5 == 0: take the vldx path at .L01_Y_0
162
96
add .d T2, IY, INCY // T2 = IY + INCY, offset used by the scalar load below
163
97
fldx.s $f4 , Y, T2
164
98
add .d T2, T2, INCY // step T2 by one more INCY
@@ -183,10 +117,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
183
117
vextrins.w U8, U9, 0x10 // lane 1 of U8 = lane 0 of U9
184
118
vextrins.w U8, U10, 0x20 // lane 2 of U8 = lane 0 of U10
185
119
vextrins.w U8, U11, 0x30 // lane 3 of U8 = lane 0 of U11
186
-
187
- vfmadd.s U4, U3, U1, U4
188
- vfmadd.s U8, U3, U14, U8
189
-
120
+ b .L01_Y_1 // scalar path done, skip the vldx path
121
+ .L01_Y_0: // vldx path, reached when T5 == 0
122
+ add .d T3, IY, INCY // T3 = IY + INCY; kept for STORE_Y_8
123
+ vldx U4, Y, T3 // U4 = 16 bytes at Y + T3
124
+ alsl.d T4, INCY, T3, 2 // T4 = T3 + (INCY << 2); kept for STORE_Y_8
125
+ vldx U8, Y, T4 // U8 = 16 bytes at Y + T4
126
+ .L01_Y_1:
127
+ .endm // NOTE(review): .L01_Y_0 and .L01_Y_1 are fixed names, so a second expansion in the same file would redefine them
128
+
129
+ .macro STORE_Y_8 // write U4 and U8 back to Y; T5 chooses between two paths
130
+ beqz T5, .L01_Y_2 // T5 == 0: take the vstx path at .L01_Y_2
190
131
vextrins.w U5, U4, 0x01 // lane 0 of U5 = lane 1 of U4
191
132
vextrins.w U6, U4, 0x02 // lane 0 of U6 = lane 2 of U4
192
133
vextrins.w U7, U4, 0x03 // lane 0 of U7 = lane 3 of U4
@@ -211,10 +152,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
211
152
fstx.s $f10 , Y, T2
212
153
add .d T2, T2, INCY // step T2 by one more INCY
213
154
fstx.s $f11 , Y, T2
214
-
215
- slli.d T2, INCY, 3
216
- add .d IY, IY, T2
217
-
155
+ b .L01_Y_3 // scalar path done, skip the vstx path
156
+ .L01_Y_2: // vstx path, reached when T5 == 0
157
+ vstx U4, Y, T3 // T3 is not set in this macro; it must still hold the offset LOAD_Y_8 computed
158
+ vstx U8, Y, T4 // same for T4
159
+ .L01_Y_3:
160
+ .endm // NOTE(review): .L01_Y_2 and .L01_Y_3 are fixed names, so a second expansion in the same file would redefine them
161
+
162
+ .macro LOAD_X_8 // load X operands for one pass of the 8-wide loop; T6 chooses between two paths
163
+ beqz T6, .L01_X_0 // T6 == 0: take the vldx path at .L01_X_0
218
164
add .d T2, IX, INCX // T2 = IX + INCX, offset used by the scalar load below
219
165
fldx.s $f4 , X, T2
220
166
add .d T2, T2, INCX // step T2 by one more INCX
@@ -239,31 +185,109 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
239
185
vextrins.w $vr8, $vr9, 0x10 // lane 1 of $vr8 = lane 0 of $vr9
240
186
vextrins.w $vr8, $vr10, 0x20 // lane 2 of $vr8 = lane 0 of $vr10
241
187
vextrins.w $vr8, $vr11, 0x30 // lane 3 of $vr8 = lane 0 of $vr11
188
+ b .L01_X_1 // scalar path done, skip the vldx path
189
+ .L01_X_0: // vldx path, reached when T6 == 0
190
+ add .d T3, IX, INCX // T3 = IX + INCX
191
+ vldx U4, X, T3 // U4 = 16 bytes at X + T3
192
+ alsl.d T4, INCX, T3, 2 // T4 = T3 + (INCX << 2)
193
+ vldx U8, X, T4 // U8 = 16 bytes at X + T4
194
+ .L01_X_1:
195
+ .endm // NOTE(review): .L01_X_0 and .L01_X_1 are fixed names, so a second expansion in the same file would redefine them
242
196
243
- vand.v $vr12, $vr2, $vr2
197
+ PROLOGUE
244
198
245
- vfmadd.s U2, U1, U4, U2
246
- vfsub.s U2, U2, $vr12
247
- vfmadd.s U2, U14, U8, U2
199
+ addi .d $sp , $sp , -88 // reserve an 88-byte frame; NOTE(review): 88 is not a multiple of 16 - confirm no 16-byte sp alignment is needed here
248
200
249
- vextrins.w U4, U2, 0x01
250
- vextrins.w U5, U2, 0x02
251
- vextrins.w U6, U2, 0x03
201
+ SDARG $r23, $sp , 0 // spill $r23..$r31 into the frame
202
+ SDARG $r24, $sp , 8
203
+ SDARG $r25, $sp , 16
204
+ SDARG $r26, $sp , 32 // offset 24 is left unused
205
+ SDARG $r27, $sp , 40
206
+ SDARG $r28, $sp , 48
207
+ SDARG $r29, $sp , 56
208
+ SDARG $r30, $sp , 64
209
+ SDARG $r31, $sp , 72
210
+ ST ALPHA, $sp , 80 // put ALPHA in memory so it can be replicated from there
252
211
253
- fadd .s $f2, $f2, $f4
254
- fadd .s $f2, $f2, $f5
255
- fadd .s $f2, $f2, $f6
256
- fadd .s $f2, $f2, $f12
212
+ vldrepl.w VALPHA, $sp , 80 // VALPHA = the 32-bit word at sp + 80 replicated into every lane
257
213
258
- vpermi.w U2, U2, 0x00
214
+ addi .d T5, INCY, -1 // T5 = INCY - 1, taken before INCY is scaled below; tested by LOAD_Y_8 and STORE_Y_8
215
+ addi .d T6, INCX, -1 // T6 = INCX - 1, taken before INCX is scaled below; tested by LOAD_X_8
216
+ slli.d LDA, LDA, BASE_SHIFT // scale LDA, INCX and INCY by BASE_SHIFT (presumably element counts to byte offsets)
217
+ slli.d INCX, INCX, BASE_SHIFT
218
+ slli.d INCY, INCY, BASE_SHIFT
219
+
220
+ bge $r0, M, .L999 // leave through .L999 when M <= 0
221
+ bge $r0, N, .L999 // leave through .L999 when N <= 0
222
+
223
+ move J, $r0 // J = 0
224
+ move JY, $r0 // JY = 0
225
+ move JX, $r0 // JX = 0
226
+ move AO1, A // AO1 = A
227
+
228
+ beq J , N, .L999 // NOTE(review): J is 0 and N > 0 was checked above, so this looks unreachable on entry
229
+
230
+ .L01: // setup for index J before the 8-wide loop at .L02
231
+ vxor.v U2, U2, U2 // clear all lanes of the U2 accumulator
232
+ fldx.s a6 , X, JX // a6 = value at X + JX
233
+ fmul .s a3, ALPHA, a6 //temp1 = ALPHA * a6
234
+ vpermi.w U3, U3, 0x00 // copy lane 0 of U3 into all four lanes (presumably U3 overlays a3, i.e. temp1)
235
+
236
+ mul.w T0, J, LDA // 32-bit product; NOTE(review): confirm J * LDA cannot exceed 32 bits
237
+ slli.d T1, J, BASE_SHIFT
238
+ add.w T0, T0, T1 // T0 = J * LDA + (J << BASE_SHIFT), presumably the diagonal entry offset
239
+ fldx.s a6 , AO1, T0 // a6 = value at AO1 + T0
240
+ fldx.s a4 , Y, JY
241
+ fmadd.s a4 , a3 , a6 , a4 // a4 = a3 * a6 + a4
242
+ fstx.s a4 , Y, JY // store the updated value back to Y + JY
243
+
244
+ move IY, JY // inner offsets start from the outer ones
245
+ move IX, JX
246
+ addi .d II, J, 1
247
+ move I, II // I = J + 1
248
+ slli.d II, II, BASE_SHIFT // II = (J + 1) << BASE_SHIFT
259
249
260
- slli.d T2, INCX, 3
261
- add .d IX, IX, T2
250
+ sub .d T0, M, J
251
+ addi .d T0, T0, -1
252
+ srai.d T0, T0, 3
253
+ add .d T0, T0, J
254
+ addi .d T0, T0, 1 // T0 = ((M - J - 1) >> 3) + J + 1, the value of I at which .L02 stops
255
+ beq I , T0, .L03 // already covered by the bge on the next line
256
+ bge I , T0, .L03 // no 8-wide pass to run when I >= T0
257
+
258
+ mul.w T1, J, LDA
259
+ add .d T1, T1, II // T1 = J * LDA + II, offset of the first vldx from AO1 in .L02
260
+
261
+ .L02: /* /8: each pass consumes two 16-byte vectors from A */
262
+ vldx U1, AO1, T1 // U1 = 16 bytes at AO1 + T1
263
+ addi .d T1, T1, 16
264
+ vldx U14, AO1, T1 // U14 = the next 16 bytes
265
+ addi .d T1, T1, 16
266
+
267
+ LOAD_Y_8 // Y operands into U4 and U8 (also sets T3 and T4 on the vldx path)
268
+
269
+ vfmadd.s U4, U3, U1, U4 // U4 = U3 * U1 + U4
270
+ vfmadd.s U8, U3, U14, U8 // U8 = U3 * U14 + U8
271
+
272
+ STORE_Y_8 // write U4 and U8 back to Y
273
+
274
+ alsl.d IY, INCY, IY, 3 // IY = IY + (INCY << 3)
275
+
276
+ LOAD_X_8 // X operands; this overwrites U4 and U8 on the vldx path
277
+
278
+ vfmadd.s U2, U1, U4, U2 // U2 = U1 * U4 + U2
279
+ vfmadd.s U2, U14, U8, U2 // U2 = U14 * U8 + U2
280
+
281
+ alsl.d IX, INCX, IX, 3 // IX = IX + (INCX << 3)
262
282
263
283
addi .d II, II, 32 // II += 32, the same amount T1 advanced above
264
284
addi .d I, I, 1
265
285
blt I , T0, .L02 // repeat while I < T0
266
286
287
+ // Acc U2: GACC comes from loongarch64_asm.S; presumably it sums the lanes of U2 into U4 - confirm
288
+ GACC vf, s, U4, U2
289
+ vpermi.w U2, U4, 0 // low lanes of U2 = lane 0 of U4; NOTE(review): vpermi.w takes its upper two lanes from the destination, so confirm later code reads lane 0 only
290
+
267
291
.L03: /* &4 */
268
292
sub .d T0, M, J
269
293
addi .d T0, T0, -1
@@ -426,4 +450,4 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
426
450
addi .d $sp , $sp , 88 // release the 88-byte frame reserved after PROLOGUE
427
451
jirl $r0, $r1, 0x0 // jump to the address in $r1; the link value goes to $r0 and is discarded
428
452
429
- EPILOGUE
453
+ EPILOGUE
0 commit comments