@@ -28,6 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ASSEMBLER

#include "common.h"
+ #include "loongarch64_asm.S"

/* Param */
#define M $r4
@@ -57,6 +58,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define T2 $r28
#define T3 $r29
#define T4 $r30
+ #define T5 $r17
+ #define T6 $r16
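+ // T5/T6: scratch registers holding INCY-1/INCX-1 (set below); zero means unit stride for Y/X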

/* LSX vectors */
#define U0 $xr31
@@ -87,10 +90,113 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define a8 $f8
#define a9 $f9

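+ // LOAD_Y_8: load 8 doubles of Y into U4/U8; with unit stride (T5 == 0) two xvldx suffice,
+ // otherwise the elements are gathered one by one with fldx.d and packed via vextrins/xvpermi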
+ .macro LOAD_Y_8
+ beqz T5, .L01_Y_0
+ add.d T2, IY, INCY
+ fldx.d $f4, Y, T2
+ add.d T2, T2, INCY
+ fldx.d $f5, Y, T2
+ add.d T2, T2, INCY
+ fldx.d $f6, Y, T2
+ add.d T2, T2, INCY
+ fldx.d $f7, Y, T2

- PROLOGUE
+ add.d T2, T2, INCY
+ fldx.d $f8, Y, T2
+ add.d T2, T2, INCY
+ fldx.d $f9, Y, T2
+ add.d T2, T2, INCY
+ fldx.d $f10, Y, T2
+ add.d T2, T2, INCY
+ fldx.d $f11, Y, T2
+
+ vextrins.d $vr4, $vr5, 0x10
+ vextrins.d $vr6, $vr7, 0x10
+ xvpermi.q U4, U6, 0x02

- LDARG BUFFER, $sp, 0
+ vextrins.d $vr8, $vr9, 0x10
+ vextrins.d $vr10, $vr11, 0x10
+ xvpermi.q U8, U10, 0x02
+ b .L01_Y_1
+ .L01_Y_0:
+ add.d T3, IY, INCY
+ xvldx U4, Y, T3
+ alsl.d T4, INCY, T3, 2
+ xvldx U8, Y, T4
+ .L01_Y_1:
+ .endm
+
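+ // LOAD_X_8: same as LOAD_Y_8 but for X, keyed on T6 (unit INCX)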
+ .macro LOAD_X_8
+ beqz T6, .L01_X_0
+ add.d T2, IX, INCX
+ fldx.d $f4, X, T2
+ add.d T2, T2, INCX
+ fldx.d $f5, X, T2
+ add.d T2, T2, INCX
+ fldx.d $f6, X, T2
+ add.d T2, T2, INCX
+ fldx.d $f7, X, T2
+
+ add.d T2, T2, INCX
+ fldx.d $f8, X, T2
+ add.d T2, T2, INCX
+ fldx.d $f9, X, T2
+ add.d T2, T2, INCX
+ fldx.d $f10, X, T2
+ add.d T2, T2, INCX
+ fldx.d $f11, X, T2
+
+ vextrins.d $vr4, $vr5, 0x10
+ vextrins.d $vr6, $vr7, 0x10
+ xvpermi.q U4, U6, 0x02
+
+ vextrins.d $vr8, $vr9, 0x10
+ vextrins.d $vr10, $vr11, 0x10
+ xvpermi.q U8, U10, 0x02
+ b .L01_X_1
+ .L01_X_0:
+ add.d T3, IX, INCX
+ xvldx U4, X, T3
+ alsl.d T2, INCX, T3, 2
+ xvldx U8, X, T2
+ .L01_X_1:
+ .endm
+
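+ // STORE_Y_8: inverse of LOAD_Y_8, writing U4/U8 back to Y, either with two xvstx (unit stride)
+ // or element by element with fstx.d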
+ .macro STORE_Y_8
+ beqz T5, .L01_Y_2
+ xvpermi.d U6, U4, 0xee
+ vextrins.d $vr5, $vr4, 0x01
+ vextrins.d $vr7, $vr6, 0x01
+
+ xvpermi.d U10, U8, 0xee
+ vextrins.d $vr9, $vr8, 0x01
+ vextrins.d $vr11, $vr10, 0x01
+
+ add.d T2, IY, INCY
+ fstx.d $f4, Y, T2
+ add.d T2, T2, INCY
+ fstx.d $f5, Y, T2
+ add.d T2, T2, INCY
+ fstx.d $f6, Y, T2
+ add.d T2, T2, INCY
+ fstx.d $f7, Y, T2
+
+ add.d T2, T2, INCY
+ fstx.d $f8, Y, T2
+ add.d T2, T2, INCY
+ fstx.d $f9, Y, T2
+ add.d T2, T2, INCY
+ fstx.d $f10, Y, T2
+ add.d T2, T2, INCY
+ fstx.d $f11, Y, T2
+ b .L01_Y_3
+ .L01_Y_2:
+ xvstx U4, Y, T3
+ xvstx U8, Y, T4
+ .L01_Y_3:
+ .endm
+
+ PROLOGUE

addi.d $sp, $sp, -88

@@ -107,6 +213,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

xvldrepl.d VALPHA, $sp, 80

+ addi.d T5, INCY, -1
+ addi.d T6, INCX, -1
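+ // unit-stride flags are taken from the raw increments, before INCX/INCY are scaled by BASE_SHIFT below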
slli.d LDA, LDA, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
slli.d INCY, INCY, BASE_SHIFT
@@ -122,11 +230,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
beq J, N, .L999

.L01:
- MTC a2, $r0 //temp2
+ xvxor.v U2, U2, U2
fldx.d a6, X, JX
fmul.d a3, ALPHA, a6 //temp1
xvreplve0.d U3, U3
- xvreplve0.d U2, U2

mul.d T0, J, LDA
slli.d T1, J, BASE_SHIFT
@@ -147,126 +254,41 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
srai.d T0, T0, 3
add.d T0, T0, J
addi.d T0, T0, 1
- beq I, T0, .L03
- bge I, T0, .L03
+ beq I, T0, .L03
+ bge I, T0, .L03

mul.d T1, J, LDA
add.d T1, T1, II

.L02: /* /8 */
xvldx U1, AO1, T1
- addi.d T1, T1, 32
- xvldx U14, AO1, T1
- addi.d T1, T1, 32
+ addi.d T2, T1, 32
+ xvldx U14, AO1, T2
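+ // T1 is no longer bumped here; it advances by 64 once at the bottom of the loop,
+ // and the second 32-byte load goes through T2 = T1 + 32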

- add.d T2, IY, INCY
- fldx.d $f4, Y, T2
- add.d T2, T2, INCY
- fldx.d $f5, Y, T2
- add.d T2, T2, INCY
- fldx.d $f6, Y, T2
- add.d T2, T2, INCY
- fldx.d $f7, Y, T2
-
- add.d T2, T2, INCY
- fldx.d $f8, Y, T2
- add.d T2, T2, INCY
- fldx.d $f9, Y, T2
- add.d T2, T2, INCY
- fldx.d $f10, Y, T2
- add.d T2, T2, INCY
- fldx.d $f11, Y, T2
-
- vextrins.d $vr4, $vr5, 0x10
- vextrins.d $vr6, $vr7, 0x10
- xvpermi.q U4, U6, 0x02
-
- vextrins.d $vr8, $vr9, 0x10
- vextrins.d $vr10, $vr11, 0x10
- xvpermi.q U8, U10, 0x02
+ LOAD_Y_8

xvfmadd.d U4, U3, U1, U4
xvfmadd.d U8, U3, U14, U8

- xvpermi.d U6, U4, 0xee
- vextrins.d $vr5, $vr4, 0x01
- vextrins.d $vr7, $vr6, 0x01
-
- xvpermi.d U10, U8, 0xee
- vextrins.d $vr9, $vr8, 0x01
- vextrins.d $vr11, $vr10, 0x01
-
- add.d T2, IY, INCY
- fstx.d $f4, Y, T2
- add.d T2, T2, INCY
- fstx.d $f5, Y, T2
- add.d T2, T2, INCY
- fstx.d $f6, Y, T2
- add.d T2, T2, INCY
- fstx.d $f7, Y, T2
-
- add.d T2, T2, INCY
- fstx.d $f8, Y, T2
- add.d T2, T2, INCY
- fstx.d $f9, Y, T2
- add.d T2, T2, INCY
- fstx.d $f10, Y, T2
- add.d T2, T2, INCY
- fstx.d $f11, Y, T2
-
- slli.d T2, INCY, 3
- add.d IY, IY, T2
-
- add.d T2, IX, INCX
- fldx.d $f4, X, T2
- add.d T2, T2, INCX
- fldx.d $f5, X, T2
- add.d T2, T2, INCX
- fldx.d $f6, X, T2
- add.d T2, T2, INCX
- fldx.d $f7, X, T2
-
- add.d T2, T2, INCX
- fldx.d $f8, X, T2
- add.d T2, T2, INCX
- fldx.d $f9, X, T2
- add.d T2, T2, INCX
- fldx.d $f10, X, T2
- add.d T2, T2, INCX
- fldx.d $f11, X, T2
-
- vextrins.d $vr4, $vr5, 0x10
- vextrins.d $vr6, $vr7, 0x10
- xvpermi.q U4, U6, 0x02
-
- vextrins.d $vr8, $vr9, 0x10
- vextrins.d $vr10, $vr11, 0x10
- xvpermi.q U8, U10, 0x02
-
- xvand.v $xr12, $xr2, $xr2
-
- xvfmadd.d U2, U1, U4, U2
- xvfsub.d U2, U2, $xr12
- xvfmadd.d U2, U14, U8, U2
+ STORE_Y_8

- xvpermi.d U4, U2, 0x01
- xvpermi.d U5, U2, 0x02
- xvpermi.d U6, U2, 0x03
+ alsl.d IY, INCY, IY, 3
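+ // IY += 8*INCY in a single alsl.d, replacing the old slli.d/add.d pair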

- fadd.d $f2, $f2, $f4
- fadd.d $f2, $f2, $f5
- fadd.d $f2, $f2, $f6
- fadd.d $f2, $f2, $f12
+ LOAD_X_8

- xvreplve0.d U2, U2
+ xvfmadd.d U2, U1, U4, U2
+ xvfmadd.d U2, U14, U8, U2

- slli.d T2, INCX, 3
- add.d IX, IX, T2
+ alsl.d IX, INCX, IX, 3

+ addi.d T1, T1, 64
addi.d II, II, 64
addi.d I, I, 1
blt I, T0, .L02

+ //Acc U2
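+ // GACC (presumably provided by the included loongarch64_asm.S) folds the four partial sums
+ // in U2 into U4; the low double is then moved to $f2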
+ GACC xvf, d, U4, U2
+ fmov.d $f2, $f4
.L03: /* &4 */
sub.d T0, M, J
addi.d T0, T0, -1
@@ -437,4 +459,4 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi.d $sp, $sp, 88
jirl $r0, $r1, 0x0

- EPILOGUE
+ EPILOGUE