@@ -28,6 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define ASSEMBLER
 
 #include "common.h"
+#include "loongarch64_asm.S"
 
 /* Param */
 #define M      $r4
@@ -57,6 +58,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define T2     $r28
 #define T3     $r29
 #define T4     $r30
+#define T5     $r17
+#define T6     $r16
+#define T7     $r12
 
 /* LSX vectors */
 #define U0     $vr31
@@ -87,10 +91,114 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define a8     $f8
 #define a9     $f9
 
+.macro LOAD_Y_8
+    beqz    T5, .L01_Y_0
+    add.d   T2, IY, INCY
+    fldx.d  $f4, Y, T2
+    add.d   T2, T2, INCY
+    fldx.d  $f5, Y, T2
+    add.d   T2, T2, INCY
+    fldx.d  $f6, Y, T2
+    add.d   T2, T2, INCY
+    fldx.d  $f7, Y, T2
 
-    PROLOGUE
+    add.d   T2, T2, INCY
+    fldx.d  $f8, Y, T2
+    add.d   T2, T2, INCY
+    fldx.d  $f9, Y, T2
+    add.d   T2, T2, INCY
+    fldx.d  $f10, Y, T2
+    add.d   T2, T2, INCY
+    fldx.d  $f11, Y, T2
+
+    vextrins.d U4, U5, 0x10
+    vextrins.d U6, U7, 0x10
+    vextrins.d U8, U9, 0x10
+    vextrins.d U10, U11, 0x10
+    b       .L01_Y_1
+.L01_Y_0:
+    add.d   T7, IY, INCY
+    vldx    U4, Y, T7
+    alsl.d  T2, INCY, T7, 1
+    vldx    U6, Y, T2
+    alsl.d  T3, INCY, T2, 1
+    vldx    U8, Y, T3
+    alsl.d  T4, INCY, T3, 1
+    vldx    U10, Y, T4
+.L01_Y_1:
+.endm
+
+.macro LOAD_X_8
+    beqz    T6, .L01_X_0
+    add.d   T2, IX, INCX
+    fldx.d  $f4, X, T2
+    add.d   T2, T2, INCX
+    fldx.d  $f5, X, T2
+    add.d   T2, T2, INCX
+    fldx.d  $f6, X, T2
+    add.d   T2, T2, INCX
+    fldx.d  $f7, X, T2
+
+    add.d   T2, T2, INCX
+    fldx.d  $f8, X, T2
+    add.d   T2, T2, INCX
+    fldx.d  $f9, X, T2
+    add.d   T2, T2, INCX
+    fldx.d  $f10, X, T2
+    add.d   T2, T2, INCX
+    fldx.d  $f11, X, T2
+
+    vextrins.d U4, U5, 0x10
+    vextrins.d U6, U7, 0x10
+    vextrins.d U8, U9, 0x10
+    vextrins.d U10, U11, 0x10
+    b       .L01_X_1
+.L01_X_0:
+    add.d   T7, IX, INCX
+    vldx    U4, X, T7
+    alsl.d  T2, INCX, T7, 1
+    vldx    U6, X, T2
+    alsl.d  T3, INCX, T2, 1
+    vldx    U8, X, T3
+    alsl.d  T4, INCX, T3, 1
+    vldx    U10, X, T4
+.L01_X_1:
+.endm
+
+.macro STORE_Y_8
+    beqz    T5, .L01_Y_2
+    vextrins.d U5, U4, 0x01
+    vextrins.d U7, U6, 0x01
+    vextrins.d U9, U8, 0x01
+    vextrins.d U11, U10, 0x01
+
+    add.d   T2, IY, INCY
+    fstx.d  $f4, Y, T2
+    add.d   T2, T2, INCY
+    fstx.d  $f5, Y, T2
+    add.d   T2, T2, INCY
+    fstx.d  $f6, Y, T2
+    add.d   T2, T2, INCY
+    fstx.d  $f7, Y, T2
+
+    add.d   T2, T2, INCY
+    fstx.d  $f8, Y, T2
+    add.d   T2, T2, INCY
+    fstx.d  $f9, Y, T2
+    add.d   T2, T2, INCY
+    fstx.d  $f10, Y, T2
+    add.d   T2, T2, INCY
+    fstx.d  $f11, Y, T2
+    b       .L01_Y_3
+.L01_Y_2:
+    vstx    U4, Y, T7
+    vstx    U6, Y, T2
+    vstx    U8, Y, T3
+    vstx    U10, Y, T4
+.L01_Y_3:
+.endm
 
-    LDARG   BUFFER, $sp, 0
+    PROLOGUE
 
     addi.d  $sp, $sp, -88
@@ -107,6 +215,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vldrepl.d VALPHA, $sp, 80
 
+    addi.d  T5, INCY, -1
+    addi.d  T6, INCX, -1
     slli.d  LDA, LDA, BASE_SHIFT
     slli.d  INCX, INCX, BASE_SHIFT
     slli.d  INCY, INCY, BASE_SHIFT
@@ -122,11 +232,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     beq     J, N, .L999
 
 .L01:
-    MTC     a2, $r0              //temp2
+    vxor.v  U2, U2, U2
     fldx.d  a6, X, JX
     fmul.d  a3, ALPHA, a6        //temp1
     vshuf4i.d U3, U3, 0x00
-    vshuf4i.d U2, U2, 0x00
 
     mul.d   T0, J, LDA
     slli.d  T1, J, BASE_SHIFT
@@ -163,105 +272,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vldx    U16, AO1, T1
     addi.d  T1, T1, 16
 
-    add.d   T2, IY, INCY
-    fldx.d  $f4, Y, T2
-    add.d   T2, T2, INCY
-    fldx.d  $f5, Y, T2
-    add.d   T2, T2, INCY
-    fldx.d  $f6, Y, T2
-    add.d   T2, T2, INCY
-    fldx.d  $f7, Y, T2
-
-    add.d   T2, T2, INCY
-    fldx.d  $f8, Y, T2
-    add.d   T2, T2, INCY
-    fldx.d  $f9, Y, T2
-    add.d   T2, T2, INCY
-    fldx.d  $f10, Y, T2
-    add.d   T2, T2, INCY
-    fldx.d  $f11, Y, T2
-
-    vextrins.d U4, U5, 0x10
-    vextrins.d U6, U7, 0x10
-    vextrins.d U8, U9, 0x10
-    vextrins.d U10, U11, 0x10
+    LOAD_Y_8
 
     vfmadd.d U4, U3, U1, U4
     vfmadd.d U6, U3, U14, U6
     vfmadd.d U8, U3, U15, U8
     vfmadd.d U10, U3, U16, U10
 
-    vextrins.d U5, U4, 0x01
-    vextrins.d U7, U6, 0x01
-    vextrins.d U9, U8, 0x01
-    vextrins.d U11, U10, 0x01
-
-    add.d   T2, IY, INCY
-    fstx.d  $f4, Y, T2
-    add.d   T2, T2, INCY
-    fstx.d  $f5, Y, T2
-    add.d   T2, T2, INCY
-    fstx.d  $f6, Y, T2
-    add.d   T2, T2, INCY
-    fstx.d  $f7, Y, T2
-
-    add.d   T2, T2, INCY
-    fstx.d  $f8, Y, T2
-    add.d   T2, T2, INCY
-    fstx.d  $f9, Y, T2
-    add.d   T2, T2, INCY
-    fstx.d  $f10, Y, T2
-    add.d   T2, T2, INCY
-    fstx.d  $f11, Y, T2
-
-    slli.d  T2, INCY, 3
-    add.d   IY, IY, T2
-
-    add.d   T2, IX, INCX
-    fldx.d  $f4, X, T2
-    add.d   T2, T2, INCX
-    fldx.d  $f5, X, T2
-    add.d   T2, T2, INCX
-    fldx.d  $f6, X, T2
-    add.d   T2, T2, INCX
-    fldx.d  $f7, X, T2
-
-    add.d   T2, T2, INCX
-    fldx.d  $f8, X, T2
-    add.d   T2, T2, INCX
-    fldx.d  $f9, X, T2
-    add.d   T2, T2, INCX
-    fldx.d  $f10, X, T2
-    add.d   T2, T2, INCX
-    fldx.d  $f11, X, T2
+    STORE_Y_8
 
-    vextrins.d U4, U5, 0x10
-    vextrins.d U6, U7, 0x10
-    vextrins.d U8, U9, 0x10
-    vextrins.d U10, U11, 0x10
+    alsl.d  IY, INCY, IY, 3
 
-    vand.v  $vr12, $vr2, $vr2
+    LOAD_X_8
 
     vfmadd.d U2, U1, U4, U2
-    vfsub.d U2, U2, $vr12
     vfmadd.d U2, U14, U6, U2
     vfmadd.d U2, U15, U8, U2
     vfmadd.d U2, U16, U10, U2
 
-    vextrins.d U4, U2, 0x01
-
-    fadd.d  $f2, $f2, $f4
-    fadd.d  $f2, $f2, $f12
-
-    vextrins.d U2, U2, 0x10
-
-    slli.d  T2, INCX, 3
-    add.d   IX, IX, T2
+    alsl.d  IX, INCX, IX, 3
 
     addi.d  II, II, 64
     addi.d  I, I, 1
     blt     I, T0, .L02
 
+    // Acc U2
+    GACC    vf, d, U4, U2
+    vilvl.d U2, U4, U4
+
 .L03: /* &4 */
     sub.d   T0, M, J
     addi.d  T0, T0, -1
@@ -429,4 +467,4 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     addi.d  $sp, $sp, 88
     jirl    $r0, $r1, 0x0
 
-    EPILOGUE
+    EPILOGUE
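Editor's note (not part of the patch): a minimal C sketch of the stride dispatch that the new LOAD_Y_8 macro performs. The helper name, signature, and scalar formulation below are illustrative only; the sketch uses element indices, whereas the assembly works with byte offsets (INCY is pre-shifted by BASE_SHIFT, and T5 holds INCY - 1 so beqz selects the contiguous path when INCY == 1). Both branches fetch the same eight elements; only the load strategy differs. LOAD_X_8 and STORE_Y_8 follow the same pattern, keyed on T6 and on the offsets T7/T2/T3/T4 cached by the vector path.

/* Illustrative C sketch only -- names invented here, not part of the kernel. */
#include <stddef.h>

static void load_y_8(const double *y, ptrdiff_t iy, ptrdiff_t incy,
                     double v[8])
{
    if (incy == 1) {
        /* Unit stride (T5 == 0): the kernel issues four 128-bit vldx loads. */
        for (int i = 0; i < 8; ++i)
            v[i] = y[iy + 1 + i];
    } else {
        /* General stride: eight fldx.d loads, packed pairwise by vextrins.d. */
        ptrdiff_t t = iy + incy;
        for (int i = 0; i < 8; ++i, t += incy)
            v[i] = y[t];
    }
}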