
Commit b2117bb

LoongArch64: Fixed LSX version of cscal and zscal

1 parent 7c3a920 commit b2117bb

1 file changed: kernel/loongarch64/cscal_lsx.S (58 additions, 160 deletions)
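
What the change does: the kernel now reads an extra flag, DUMMY2, from the stack; per the comments in the diff, it is nonzero when the kernel is entered through the user-facing c/zscal interface. The zero-fill shortcuts (.L15, .L27 and the scalar tail .L995 below) are taken only when that flag is clear and both alpha_r and alpha_i compare equal to zero; every other combination goes through the full complex multiply, so a NaN or Inf already present in x propagates instead of being overwritten with zeros. A minimal C sketch of these per-element semantics (illustrative names, not the OpenBLAS kernel signature):

#include <complex.h>

/* Sketch only: from_interface plays the role of the DUMMY2 flag; n and
   incx count complex elements, as in BLAS. */
void cscal_ref(int n, float complex alpha, float complex *x, int incx,
               int from_interface)
{
    if (n <= 0 || incx <= 0)       /* mirrors the two early exits to .L999 */
        return;
    /* Zero-fill would silently drop NaN/Inf already in x, so it is taken
       only for internal callers; crealf(alpha) == 0.0f is false when
       alpha_r is NaN, matching the CMPEQ behavior. */
    int zero_fill = !from_interface &&
                    crealf(alpha) == 0.0f && cimagf(alpha) == 0.0f;
    for (int i = 0; i < n; i++, x += incx)
        *x = zero_fill ? 0 : alpha * *x;    /* NaN * 0 stays NaN here */
}

Called as cscal_ref(n, 0, x, 1, 1), a NaN in x survives scaling by zero; the old code's .L111/.L221 paths stored zeros unconditionally whenever both parts of alpha were zero, regardless of the caller.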
@@ -33,6 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define ALPHAI  $f1
 #define X       $r7
 #define INCX    $r8
+#define DUMMY2  $r9
 
 #define I       $r12
 #define TEMP    $r13
@@ -65,6 +66,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
     bge $r0, N, .L999
     bge $r0, INCX, .L999
+    ld.d DUMMY2, $sp, 0
     li.d TEMP, 1
     movgr2fr.d a1, $r0
     FFINT a1, a1
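
Why the new flag arrives on the stack rather than in a register: the LoongArch64 calling convention passes the first eight integer arguments in $a0-$a7 ($r4-$r11). Assuming the argument list of the generic OpenBLAS zscal kernels (sketched below; the alpha components travel in floating-point registers), dummy2 is the ninth integer argument and therefore lands at [$sp + 0]. The ld.d above loads it into $r9, whose incoming value, the unused y pointer, is no longer needed:

/* Assumed kernel signature, following the generic OpenBLAS zscal
   kernels (BLASLONG is a 64-bit integer; FLOAT is float for cscal,
   double for zscal): */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1,  /* $r4, $r5, $r6   */
          FLOAT alpha_r, FLOAT alpha_i,                  /* $f0, $f1        */
          FLOAT *x, BLASLONG inc_x,                      /* $r7, $r8        */
          FLOAT *y, BLASLONG inc_y, FLOAT *dummy,        /* $r9, $r10, $r11 */
          BLASLONG dummy2);                              /* [$sp + 0]       */

This is consistent with the register #defines in the first hunk: X is $r7 and INCX is $r8.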
@@ -84,24 +86,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     srai.d I, N, 2
     bne INCX, TEMP, .L22
 
+/////// INCX == 1 ////////
 .L11:
-    bge $r0, I, .L997
     CMPEQ $fcc0, ALPHAR, a1
     CMPEQ $fcc1, ALPHAI, a1
-    bceqz $fcc0, .L13
-    b .L14
-    .align 3
+    bge $r0, I, .L19
 
-.L13:
-    bceqz $fcc1, .L114 //alpha_r != 0.0 && alpha_i != 0.0
-    b .L113 //alpha_r != 0.0 && alpha_i == 0.0
+/////// INCX == 1 && N >= 4 ////////
+    bnez DUMMY2, .L17 // if DUMMY2 == 1, called from c/zscal.
 
-.L14:
-    bceqz $fcc1, .L114 //alpha_r == 0.0 && alpha_i != 0.0
-    b .L111 //alpha_r == 0.0 && alpha_i == 0.0
-    .align 3
+    bceqz $fcc0, .L17
 
-.L111: //alpha_r == 0.0 && alpha_i == 0.0
+    bceqz $fcc1, .L17
+
+.L15: //alpha_r == 0.0 && alpha_i == 0.0
     vst VXZ, X, 0 * SIZE
 #ifdef DOUBLE
     vst VXZ, X, 2 * SIZE
@@ -112,50 +110,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
     addi.d X, X, 8 * SIZE
     addi.d I, I, -1
-    blt $r0, I, .L111
-    b .L997
-    .align 3
-
-.L113: //alpha_r != 0.0 && alpha_i == 0.0
-    vld VX0, X, 0 * SIZE
-#ifdef DOUBLE
-    vld VX1, X, 2 * SIZE
-    vpickev.d x1, VX1, VX0
-    vpickod.d x2, VX1, VX0
-    vfmul.d x3, VXAR, x1
-    vfmul.d x4, VXAR, x2
-    vilvl.d VX2, x4, x3
-    vilvh.d VX3, x4, x3
-    vst VX2, X, 0 * SIZE
-    vst VX3, X, 2 * SIZE
-    vld VX0, X, 4 * SIZE
-    vld VX1, X, 6 * SIZE
-    vpickev.d x1, VX1, VX0
-    vpickod.d x2, VX1, VX0
-    vfmul.d x3, VXAR, x1
-    vfmul.d x4, VXAR, x2
-    vilvl.d VX2, x4, x3
-    vilvh.d VX3, x4, x3
-    vst VX2, X, 4 * SIZE
-    vst VX3, X, 6 * SIZE
-#else
-    vld VX1, X, 4 * SIZE
-    vpickev.w x1, VX1, VX0
-    vpickod.w x2, VX1, VX0
-    vfmul.s x3, VXAR, x1
-    vfmul.s x4, VXAR, x2
-    vilvl.w VX2, x4, x3
-    vilvh.w VX3, x4, x3
-    vst VX2, X, 0 * SIZE
-    vst VX3, X, 4 * SIZE
-#endif
-    addi.d X, X, 8 * SIZE
-    addi.d I, I, -1
-    blt $r0, I, .L113
-    b .L997
+    blt $r0, I, .L15
+    b .L19
     .align 3
 
-.L114: //alpha_r != 0.0 && alpha_i != 0.0
+.L17:
     vld VX0, X, 0 * SIZE
 #ifdef DOUBLE
     vld VX1, X, 2 * SIZE
@@ -196,29 +155,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
     addi.d X, X, 8 * SIZE
     addi.d I, I, -1
-    blt $r0, I, .L114
-    b .L997
+    blt $r0, I, .L17
+    b .L19
     .align 3
 
+/////// INCX == 1 && N < 8 ///////
+.L19:
+    andi I, N, 3
+    beqz I, .L999
+    bnez DUMMY2, .L998 // if DUMMY2 == 1, called from c/zscal.
+
+    bceqz $fcc0, .L998
+
+    bceqz $fcc1, .L998
+
+    b .L995 // alpha_r == 0.0 && alpha_i == 0.0
+
+/////// INCX != 1 ////////
 .L22:
-    bge $r0, I, .L997
-    move XX, X
     CMPEQ $fcc0, ALPHAR, a1
     CMPEQ $fcc1, ALPHAI, a1
-    bceqz $fcc0, .L23
-    b .L24
-    .align 3
+    move XX, X
+    bge $r0, I, .L29
+    bnez DUMMY2, .L25 // if DUMMY2 == 1, called from c/zscal.
 
-.L23:
-    bceqz $fcc1, .L224 //alpha_r != 0.0 && alpha_i != 0.0
-    b .L223 //alpha_r != 0.0 && alpha_i == 0.0
+    bceqz $fcc0, .L25
 
-.L24:
-    bceqz $fcc1, .L224 //alpha_r == 0.0 && alpha_i != 0.0
-    b .L221 //alpha_r == 0.0 && alpha_i == 0.0
-    .align 3
+    bceqz $fcc1, .L25
 
-.L221: //alpha_r == 0.0 && alpha_i == 0.0
+.L27: //alpha_r == 0.0 && alpha_i == 0.0
 #ifdef DOUBLE
     vstelm.d VXZ, X, 0, 0
     vstelm.d VXZ, X, 1 * SIZE, 0
@@ -246,92 +211,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
     add.d X, X, INCX
     addi.d I, I, -1
-    blt $r0, I, .L221
-    b .L997
+    blt $r0, I, .L27
+    b .L29
     .align 3
 
-.L223: //alpha_r != 0.0 && alpha_i == 0.0
-#ifdef DOUBLE
-    ld.d t1, X, 0 * SIZE
-    ld.d t2, X, 1 * SIZE
-    add.d X, X, INCX
-    ld.d t3, X, 0 * SIZE
-    ld.d t4, X, 1 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.d x1, t1, 0
-    vinsgr2vr.d x2, t2, 0
-    vinsgr2vr.d x1, t3, 1
-    vinsgr2vr.d x2, t4, 1
-    vfmul.d x3, VXAR, x1
-    vfmul.d x4, VXAR, x2
-    vstelm.d x3, XX, 0 * SIZE, 0
-    vstelm.d x4, XX, 1 * SIZE, 0
-    add.d XX, XX, INCX
-    vstelm.d x3, XX, 0 * SIZE, 1
-    vstelm.d x4, XX, 1 * SIZE, 1
-    add.d XX, XX, INCX
-
-    ld.d t1, X, 0 * SIZE
-    ld.d t2, X, 1 * SIZE
-    add.d X, X, INCX
-    ld.d t3, X, 0 * SIZE
-    ld.d t4, X, 1 * SIZE
-    vinsgr2vr.d x1, t1, 0
-    vinsgr2vr.d x2, t2, 0
-    vinsgr2vr.d x1, t3, 1
-    vinsgr2vr.d x2, t4, 1
-    add.d X, X, INCX
-    vfmul.d x3, VXAR, x1
-    vfmul.d x4, VXAR, x2
-    addi.d I, I, -1
-    vstelm.d x3, XX, 0 * SIZE, 0
-    vstelm.d x4, XX, 1 * SIZE, 0
-    add.d XX, XX, INCX
-    vstelm.d x3, XX, 0 * SIZE, 1
-    vstelm.d x4, XX, 1 * SIZE, 1
-#else
-    ld.w t1, X, 0 * SIZE
-    ld.w t2, X, 1 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    ld.w t4, X, 1 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.w x1, t1, 0
-    vinsgr2vr.w x2, t2, 0
-    vinsgr2vr.w x1, t3, 1
-    vinsgr2vr.w x2, t4, 1
-    ld.w t1, X, 0 * SIZE
-    ld.w t2, X, 1 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    ld.w t4, X, 1 * SIZE
-    vinsgr2vr.w x1, t1, 2
-    vinsgr2vr.w x2, t2, 2
-    vinsgr2vr.w x1, t3, 3
-    vinsgr2vr.w x2, t4, 3
-    add.d X, X, INCX
-
-    vfmul.s x3, VXAR, x1
-    vfmul.s x4, VXAR, x2
-    addi.d I, I, -1
-    vstelm.w x3, XX, 0 * SIZE, 0
-    vstelm.w x4, XX, 1 * SIZE, 0
-    add.d XX, XX, INCX
-    vstelm.w x3, XX, 0 * SIZE, 1
-    vstelm.w x4, XX, 1 * SIZE, 1
-    add.d XX, XX, INCX
-    vstelm.w x3, XX, 0 * SIZE, 2
-    vstelm.w x4, XX, 1 * SIZE, 2
-    add.d XX, XX, INCX
-    vstelm.w x3, XX, 0 * SIZE, 3
-    vstelm.w x4, XX, 1 * SIZE, 3
-#endif
-    add.d XX, XX, INCX
-    blt $r0, I, .L223
-    b .L997
-    .align 3
-
-.L224: //alpha_r != 0.0 && alpha_i != 0.0
+.L25:
 #ifdef DOUBLE
     ld.d t1, X, 0 * SIZE
     ld.d t2, X, 1 * SIZE
@@ -414,15 +298,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vstelm.w x4, XX, 1 * SIZE, 3
 #endif
     add.d XX, XX, INCX
-    blt $r0, I, .L224
-    b .L997
+    blt $r0, I, .L25
+    b .L29
     .align 3
 
-.L997:
-    andi I, N, 3
-    bge $r0, I, .L999
-    .align 3
+/////// INCX != 1 && N < 8 ///////
+.L29:
+    andi I, N, 3
+    beqz I, .L999
+    bnez DUMMY2, .L998 // if DUMMY2 == 1, called from c/zscal.
+
+    bceqz $fcc0, .L998
+
+    bceqz $fcc1, .L998
 
+    b .L995 // alpha_r == 0.0 && alpha_i == 0.0
+
+.L995: // alpha_r == 0.0 && alpha_i == 0.0
+    ST a1, X, 0 * SIZE
+    ST a1, X, 1 * SIZE
+    addi.d I, I, -1
+    add.d X, X, INCX
+    blt $r0, I, .L995
+    b .L999
 .L998:
     LD a1, X, 0 * SIZE
    LD a2, X, 1 * SIZE
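
Note that the remainder paths .L19 and .L29 branch on $fcc0/$fcc1 without re-running CMPEQ: the flags set at .L11/.L22 are still valid because the vector loops never write the condition-flag registers. The tail then either stores a1 (zeroed at entry via movgr2fr.d/FFINT) to both components in the new .L995 loop, or falls back to the pre-existing scalar multiply loop at .L998. In the terms of the cscal_ref sketch above, the tail is simply:

/* scalar tail for the n % 4 leftover elements (.L995 / .L998) */
for (int left = n & 3; left > 0; left--, x += incx)
    *x = zero_fill ? 0 : alpha * *x;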
@@ -435,7 +333,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     ST s2, X, 1 * SIZE
     add.d X, X, INCX
     blt $r0, I, .L998
-    .align 3
+    b .L999
 
 .L999:
     move $r4, $r12

0 commit comments