@@ -33,6 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHAI $f1
#define X $r7
#define INCX $r8
+ #define DUMMY2 $r9

#define I $r12
#define TEMP $r13
@@ -65,6 +66,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

bge $r0, N, .L999
bge $r0, INCX, .L999
+ ld.d DUMMY2, $sp, 0
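+ // DUMMY2 is an extra flag argument taken from the stack; it is nonzero when this kernel is entered from c/zscal (see the DUMMY2 branches below)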
li.d TEMP, 1
movgr2fr.d a1, $r0
FFINT a1, a1
@@ -84,24 +86,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
srai.d I, N, 2
bne INCX, TEMP, .L22

+ /////// INCX == 1 ////////
.L11:
- bge $r0, I, .L997
CMPEQ $fcc0, ALPHAR, a1
CMPEQ $fcc1, ALPHAI, a1
- bceqz $fcc0, .L13
- b .L14
- .align 3
+ bge $r0, I, .L19

- .L13:
- bceqz $fcc1, .L114 //alpha_r != 0.0 && alpha_i != 0.0
- b .L113 //alpha_r != 0.0 && alpha_i == 0.0
+ /////// INCX == 1 && N >= 4 ////////
+ bnez DUMMY2, .L17 // if DUMMY2 == 1, called from c/zscal.
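+ // when entered from c/zscal, the alpha == 0 shortcut is skipped and every element goes through the multiply path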

- .L14:
- bceqz $fcc1, .L114 //alpha_r == 0.0 && alpha_i != 0.0
- b .L111 //alpha_r == 0.0 && alpha_i == 0.0
- .align 3
+ bceqz $fcc0, .L17

- .L111: //alpha_r == 0.0 && alpha_i == 0.0
+ bceqz $fcc1, .L17
+
+ .L15: //alpha_r == 0.0 && alpha_i == 0.0
vst VXZ, X, 0 * SIZE
#ifdef DOUBLE
vst VXZ, X, 2 * SIZE
@@ -112,50 +110,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
addi.d X, X, 8 * SIZE
addi.d I, I, -1
- blt $r0, I, .L111
- b .L997
- .align 3
-
- .L113: //alpha_r != 0.0 && alpha_i == 0.0
- vld VX0, X, 0 * SIZE
- #ifdef DOUBLE
- vld VX1, X, 2 * SIZE
- vpickev.d x1, VX1, VX0
- vpickod.d x2, VX1, VX0
- vfmul.d x3, VXAR, x1
- vfmul.d x4, VXAR, x2
- vilvl.d VX2, x4 ,x3
- vilvh.d VX3, x4, x3
- vst VX2, X, 0 * SIZE
- vst VX3, X, 2 * SIZE
- vld VX0, X, 4 * SIZE
- vld VX1, X, 6 * SIZE
- vpickev.d x1, VX1, VX0
- vpickod.d x2, VX1, VX0
- vfmul.d x3, VXAR, x1
- vfmul.d x4, VXAR, x2
- vilvl.d VX2, x4 ,x3
- vilvh.d VX3, x4, x3
- vst VX2, X, 4 * SIZE
- vst VX3, X, 6 * SIZE
- #else
- vld VX1, X, 4 * SIZE
- vpickev.w x1, VX1, VX0
- vpickod.w x2, VX1, VX0
- vfmul.s x3, VXAR, x1
- vfmul.s x4, VXAR, x2
- vilvl.w VX2, x4 ,x3
- vilvh.w VX3, x4, x3
- vst VX2, X, 0 * SIZE
- vst VX3, X, 4 * SIZE
- #endif
- addi.d X, X, 8 * SIZE
- addi.d I, I, -1
- blt $r0, I, .L113
- b .L997
+ blt $r0, I, .L15
+ b .L19
.align 3

- .L114: //alpha_r != 0.0 && alpha_i != 0.0
+ .L17:
vld VX0, X, 0 * SIZE
#ifdef DOUBLE
vld VX1, X, 2 * SIZE
@@ -196,29 +155,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
addi.d X, X, 8 * SIZE
addi.d I, I, -1
- blt $r0, I, .L114
- b .L997
+ blt $r0, I, .L17
+ b .L19
.align 3

+ /////// INCX == 1 && N < 4 ///////
+ .L19:
+ andi I, N, 3
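+ // I = N & 3: elements left over after the unrolled-by-four loop above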
+ beqz I, .L999
+ bnez DUMMY2, .L998 // if DUMMY2 == 1, called from c/zscal.
+
+ bceqz $fcc0, .L998
+
+ bceqz $fcc1, .L998
+
+ b .L995 // alpha_r == 0.0 && alpha_i == 0.0
+
+ /////// INCX != 1 ////////
.L22:
- bge $r0, I, .L997
- move XX, X
CMPEQ $fcc0, ALPHAR, a1
CMPEQ $fcc1, ALPHAI, a1
- bceqz $fcc0, .L23
- b .L24
- .align 3
+ move XX, X
+ bge $r0, I, .L29
+ bnez DUMMY2, .L25 // if DUMMY2 == 1, called from c/zscal.

- .L23:
- bceqz $fcc1, .L224 //alpha_r != 0.0 && alpha_i != 0.0
- b .L223 //alpha_r != 0.0 && alpha_i == 0.0
+ bceqz $fcc0, .L25

- .L24:
- bceqz $fcc1, .L224 //alpha_r == 0.0 && alpha_i != 0.0
- b .L221 //alpha_r == 0.0 && alpha_i == 0.0
- .align 3
+ bceqz $fcc1, .L25

- .L221 : //alpha_r == 0.0 && alpha_i == 0.0
+ .L27 : //alpha_r == 0.0 && alpha_i == 0.0
#ifdef DOUBLE
vstelm.d VXZ, X, 0, 0
vstelm.d VXZ, X, 1 * SIZE, 0
@@ -246,92 +211,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
add.d X, X, INCX
addi.d I, I, -1
- blt $r0, I, .L221
- b .L997
+ blt $r0, I, .L27
+ b .L29
.align 3

- .L223: //alpha_r != 0.0 && alpha_i == 0.0
- #ifdef DOUBLE
- ld.d t1, X, 0 * SIZE
- ld.d t2, X, 1 * SIZE
- add.d X, X, INCX
- ld.d t3, X, 0 * SIZE
- ld.d t4, X, 1 * SIZE
- add.d X, X, INCX
- vinsgr2vr.d x1, t1, 0
- vinsgr2vr.d x2, t2, 0
- vinsgr2vr.d x1, t3, 1
- vinsgr2vr.d x2, t4, 1
- vfmul.d x3, VXAR, x1
- vfmul.d x4, VXAR, x2
- vstelm.d x3, XX, 0 * SIZE, 0
- vstelm.d x4, XX, 1 * SIZE, 0
- add.d XX, XX, INCX
- vstelm.d x3, XX, 0 * SIZE, 1
- vstelm.d x4, XX, 1 * SIZE, 1
- add.d XX, XX, INCX
-
- ld.d t1, X, 0 * SIZE
- ld.d t2, X, 1 * SIZE
- add.d X, X, INCX
- ld.d t3, X, 0 * SIZE
- ld.d t4, X, 1 * SIZE
- vinsgr2vr.d x1, t1, 0
- vinsgr2vr.d x2, t2, 0
- vinsgr2vr.d x1, t3, 1
- vinsgr2vr.d x2, t4, 1
- add.d X, X, INCX
- vfmul.d x3, VXAR, x1
- vfmul.d x4, VXAR, x2
- addi.d I, I, -1
- vstelm.d x3, XX, 0 * SIZE, 0
- vstelm.d x4, XX, 1 * SIZE, 0
- add.d XX, XX, INCX
- vstelm.d x3, XX, 0 * SIZE, 1
- vstelm.d x4, XX, 1 * SIZE, 1
- #else
- ld.w t1, X, 0 * SIZE
- ld.w t2, X, 1 * SIZE
- add.d X, X, INCX
- ld.w t3, X, 0 * SIZE
- ld.w t4, X, 1 * SIZE
- add.d X, X, INCX
- vinsgr2vr.w x1, t1, 0
- vinsgr2vr.w x2, t2, 0
- vinsgr2vr.w x1, t3, 1
- vinsgr2vr.w x2, t4, 1
- ld.w t1, X, 0 * SIZE
- ld.w t2, X, 1 * SIZE
- add.d X, X, INCX
- ld.w t3, X, 0 * SIZE
- ld.w t4, X, 1 * SIZE
- vinsgr2vr.w x1, t1, 2
- vinsgr2vr.w x2, t2, 2
- vinsgr2vr.w x1, t3, 3
- vinsgr2vr.w x2, t4, 3
- add.d X, X, INCX
-
- vfmul.s x3, VXAR, x1
- vfmul.s x4, VXAR, x2
- addi.d I, I, -1
- vstelm.w x3, XX, 0 * SIZE, 0
- vstelm.w x4, XX, 1 * SIZE, 0
- add.d XX, XX, INCX
- vstelm.w x3, XX, 0 * SIZE, 1
- vstelm.w x4, XX, 1 * SIZE, 1
- add.d XX, XX, INCX
- vstelm.w x3, XX, 0 * SIZE, 2
- vstelm.w x4, XX, 1 * SIZE, 2
- add.d XX, XX, INCX
- vstelm.w x3, XX, 0 * SIZE, 3
- vstelm.w x4, XX, 1 * SIZE, 3
- #endif
- add.d XX, XX, INCX
- blt $r0, I, .L223
- b .L997
- .align 3
-
- .L224: //alpha_r != 0.0 && alpha_i != 0.0
+ .L25:
#ifdef DOUBLE
ld.d t1, X, 0 * SIZE
ld.d t2, X, 1 * SIZE
@@ -414,15 +298,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vstelm.w x4, XX, 1 * SIZE, 3
#endif
add.d XX, XX, INCX
- blt $r0, I, .L224
- b .L997
+ blt $r0, I, .L25
+ b .L29
.align 3

- .L997:
- andi I, N, 3
- bge $r0, I, .L999
- .align 3
+ /////// INCX != 1 && N < 4 ///////
+ .L29:
+ andi I, N, 3
+ beqz I, .L999
+ bnez DUMMY2, .L998 // if DUMMY2 == 1, called from c/zscal.
+
+ bceqz $fcc0, .L998
+
+ bceqz $fcc1, .L998

+ b .L995 // alpha_r == 0.0 && alpha_i == 0.0
+
+ .L995: // alpha_r == 0.0 && alpha_i == 0.0
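+ // write 0.0 (a1 was zeroed in the prologue) to the real and imaginary part of each remaining element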
+ ST a1, X, 0 * SIZE
+ ST a1, X, 1 * SIZE
+ addi.d I, I, -1
+ add.d X, X, INCX
+ blt $r0, I, .L995
+ b .L999
.L998:
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
@@ -435,7 +333,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ST s2, X, 1 * SIZE
add.d X, X, INCX
blt $r0, I, .L998
- .align 3
+ b .L999

.L999:
move $r4, $r12