@@ -47,6 +47,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
47
47
#define VX4 $xr21
48
48
#define res1 $xr19
49
49
#define res2 $xr20
50
+ #define RCP $f2
51
+ #define VALPHA $xr3
50
52
51
53
PROLOGUE
52
54
@@ -55,10 +57,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
55
57
LDINT INCX, 0 (INCX)
56
58
#endif
57
59
58
- xvxor.v res1, res1, res1
59
- xvxor.v res2, res2, res2
60
60
bge $r0, N, .L999
61
61
beq $r0, INCX, .L999
62
+
63
+ addi.d $sp, $sp, -32
64
+ st.d $ra, $sp, 0
65
+ st.d N, $sp, 8
66
+ st.d X, $sp, 16
67
+ st.d INCX, $sp, 24
68
+ #ifdef DYNAMIC_ARCH
69
+ bl camax_k_LA264
70
+ #else
71
+ bl camax_k
72
+ #endif
73
+ ld.d $ra, $sp, 0
74
+ ld.d N, $sp, 8
75
+ ld.d X, $sp, 16
76
+ ld.d INCX, $sp, 24
77
+ addi.d $sp, $sp, 32
78
+
79
+ frecip.s RCP, $f0
80
+ vreplvei.w $vr3, $vr2, 0
81
+ xvpermi.d VALPHA, $xr3, 0x00
82
+ xvxor.v res1, res1, res1
83
+ xvxor.v res2, res2, res2
84
+ fcmp.ceq.s $fcc0, $f0, $f19
85
+ bcnez $fcc0, .L999
86
+
62
87
li.d TEMP, SIZE
63
88
slli.d INCX, INCX, ZBASE_SHIFT
64
89
srai.d I, N, 2
@@ -67,13 +92,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
67
92
.align 3
68
93
69
94
.L10:
70
- xvld VX0, X, 0 * SIZE
71
- xvfcvtl.d.s VX1, VX0
72
- xvfcvth.d.s VX2, VX0
73
- xvfmadd.d res1, VX1, VX1, res1
74
- xvfmadd.d res2, VX2, VX2, res2
75
95
addi.d I, I, -1
76
- addi.d X, X, 8 * SIZE
96
+
97
+ xvld VX0, X, 0 * SIZE
98
+ xvld VX1, X, 8 * SIZE
99
+ xvfmul.s VX0, VX0, VALPHA
100
+ xvfmul.s VX1, VX1, VALPHA
101
+ xvfmadd.s res1, VX0, VX0, res1
102
+ xvfmadd.s res2, VX1, VX1, res2
103
+
104
+ addi.d X, X, 16 * SIZE
77
105
blt $r0, I, .L10
78
106
.align 3
79
107
b .L996
@@ -103,22 +131,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
103
131
xvinsgr2vr.w VX0, t3, 6
104
132
xvinsgr2vr.w VX0, t4, 7
105
133
add.d X, X, INCX
106
- xvfcvtl.d.s VX1, VX0
107
- xvfcvth.d.s VX2, VX0
108
- xvfmadd.d res1, VX1, VX1, res1
109
- xvfmadd.d res2, VX2, VX2, res2
134
+ xvfmul.s VX0, VX0, VALPHA
135
+ xvfmadd.s res2, VX0, VX0, res2
110
136
addi.d I, I, -1
111
137
blt $r0, I, .L21
112
138
b .L996
113
139
114
140
.L996:
115
- xvfadd.d res1, res1, res2
116
- xvpickve.d VX1, res1, 1
117
- xvpickve.d VX2, res1, 2
118
- xvpickve.d VX3, res1, 3
119
- xvfadd.d res1, VX1, res1
120
- xvfadd.d res1, VX2, res1
121
- xvfadd.d res1, VX3, res1
141
+ xvfadd.s res1, res1, res2
142
+ xvpermi.d VX1, res1, 0x4e
143
+ xvfadd.s res1, res1, VX1
144
+ vreplvei.w $vr17, $vr19, 1
145
+ vreplvei.w $vr18, $vr19, 2
146
+ vreplvei.w $vr21, $vr19, 3
147
+ xvfadd.s res1, VX2, res1
148
+ xvfadd.s res1, VX3, res1
149
+ xvfadd.s res1, VX4, res1
122
150
.align 3
123
151
124
152
.L997:
@@ -130,18 +158,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
130
158
fld.s a1, X, 0 * SIZE
131
159
fld.s a2, X, 1 * SIZE
132
160
addi.d I, I, -1
133
- fcvt.d.s a1, a1
134
- fcvt.d.s a2, a2
135
- fmadd.d res, a1, a1, res
136
- fmadd.d res, a2, a2, res
161
+ fmul.s a1, a1, RCP
162
+ fmul.s a2, a2, RCP
163
+ fmadd.s res, a1, a1, res
164
+ fmadd.s res, a2, a2, res
137
165
add.d X, X, INCX
138
166
blt $r0, I, .L998
139
167
.align 3
140
168
141
169
.L999:
142
- fsqrt.d res, res
170
+ fsqrt.s res, res
171
+ fmul.s $f0, res, $f0
143
172
move $r4, $r17
144
- fcvt.s.d $f0, res
145
173
jirl $r0, $r1, 0x0
146
174
147
175
EPILOGUE
0 commit comments