@@ -51,6 +51,8 @@ PROLOGUE
LDINT INCX, 0(INCX)
LDINT INCY, 0(INCY)
#endif
+
+ /* init $f8 and $f9 to zero */
SUB s1, s1, s1
SUB s2, s2, s2
slli.d INCX, INCX, BASE_SHIFT
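For reference, the kernel's contract in scalar C (a sketch; dsdot_ref and its signature are illustrative, not from the source): DOT returns the sum of elementwise products, and under DSDOT the inputs are single precision while every product and the running sum are kept in double precision.

#include <stddef.h>

/* Illustrative reference semantics only. Under DSDOT, float inputs
   are widened so products and the accumulator are computed in double. */
static double dsdot_ref(size_t n, const float *x, const float *y)
{
    double acc = 0.0;   /* the vector code splits this across $xr8/$xr9 */
    for (size_t i = 0; i < n; i++)
        acc += (double)x[i] * (double)y[i];
    return acc;
}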
@@ -59,25 +61,33 @@ PROLOGUE
bge $r0, N, .L999
bne INCX, TEMP, .L20 /* inc_x=1 */
bne INCY, TEMP, .L20 /* inc_y=1 */
- #ifdef DOUBLE
- srai.d I, N, 4
- #else
- srai.d I, N, 5
- #endif
+
+ /* !((inc_x == 1) && (inc_y == 1)) */

/* init $xr8 and $xr9 to zero */
#ifdef DOUBLE
xvldrepl.d $xr0, X, 0
#else
xvldrepl.w $xr0, X, 0
#endif
+ #ifdef DSDOT
+ xvfcvtl.d.s $xr0, $xr0
+ xvfsub.d $xr8, $xr0, $xr0
+ xvfsub.d $xr9, $xr0, $xr0
+ #else
XVFSUB $xr8, $xr0, $xr0
XVFSUB $xr9, $xr0, $xr0
+ #endif

- /* !((inc_x == 1) && (inc_y == 1)) */
- bge $r0, I, .L12 /* <32 */
+ #ifdef DOUBLE
+ srai.d I, N, 4
+ #else
+ srai.d I, N, 5
+ #endif
+ bge $r0, I, .L12 /* FLOAT: <32 ; DOUBLE: <16 */
+ .align 3
.L11:
- /* case 32~ */
+ /* FLOAT: 32~ ; DOUBLE: 16~ */
xvld $xr0, X, 0
xvld $xr1, X, 32
xvld $xr2, X, 64
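Two things happen in the DSDOT path above: the accumulators are zeroed in double precision (the loaded value is widened with xvfcvtl.d.s, then subtracted from itself), and the .L11 trip count comes from the shifts, since each iteration consumes 128 bytes per operand. A sketch of that bookkeeping (the function name is illustrative):

#include <stddef.h>

/* Trip count of the unrolled .L11 loop: four 256-bit xvld per operand
   = 128 bytes = 32 floats (srai.d I, N, 5) or 16 doubles (srai.d I, N, 4). */
static size_t main_loop_trips(size_t n, int is_double)
{
    return is_double ? (n >> 4) : (n >> 5);
}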
@@ -89,11 +99,39 @@ PROLOGUE
addi.w I, I, -1
addi.d X, X, 128
addi.d Y, Y, 128
+ #ifdef DSDOT
+ xvfcvtl.d.s $xr10, $xr0
+ xvfcvtl.d.s $xr11, $xr4
+ xvfcvth.d.s $xr12, $xr0
+ xvfcvth.d.s $xr13, $xr4
+ xvfmadd.d $xr8, $xr10, $xr11, $xr8
+ xvfmadd.d $xr9, $xr12, $xr13, $xr9
+ xvfcvtl.d.s $xr10, $xr1
+ xvfcvtl.d.s $xr11, $xr5
+ xvfcvth.d.s $xr12, $xr1
+ xvfcvth.d.s $xr13, $xr5
+ xvfmadd.d $xr8, $xr10, $xr11, $xr8
+ xvfmadd.d $xr9, $xr12, $xr13, $xr9
+ xvfcvtl.d.s $xr10, $xr2
+ xvfcvtl.d.s $xr11, $xr6
+ xvfcvth.d.s $xr12, $xr2
+ xvfcvth.d.s $xr13, $xr6
+ xvfmadd.d $xr8, $xr10, $xr11, $xr8
+ xvfmadd.d $xr9, $xr12, $xr13, $xr9
+ xvfcvtl.d.s $xr10, $xr3
+ xvfcvtl.d.s $xr11, $xr7
+ xvfcvth.d.s $xr12, $xr3
+ xvfcvth.d.s $xr13, $xr7
+ xvfmadd.d $xr8, $xr10, $xr11, $xr8
+ xvfmadd.d $xr9, $xr12, $xr13, $xr9
+ #else
XVFMADD $xr8, $xr0, $xr4, $xr8
XVFMADD $xr9, $xr1, $xr5, $xr9
XVFMADD $xr8, $xr2, $xr6, $xr8
XVFMADD $xr9, $xr3, $xr7, $xr9
+ #endif
bnez I, .L11
+ .align 3
.L12:
#ifdef DOUBLE
andi I, N, 0xf
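The DSDOT body widens each 256-bit float vector into two double vectors — xvfcvtl.d.s converts the low four lanes, xvfcvth.d.s the high four — then folds x*y into the two accumulators with fused multiply-add. One such 8-lane group written as scalar C (a sketch; names are illustrative):

/* One DSDOT group: widen the low/high halves of 8 floats of x and y
   to double, then multiply-accumulate into two partial-sum vectors. */
static void dsdot_group8(const float x[8], const float y[8],
                         double acc_lo[4], double acc_hi[4])
{
    for (int i = 0; i < 4; i++) {
        acc_lo[i] += (double)x[i]     * (double)y[i];     /* xvfcvtl.d.s + xvfmadd.d */
        acc_hi[i] += (double)x[i + 4] * (double)y[i + 4]; /* xvfcvth.d.s + xvfmadd.d */
    }
}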
@@ -102,18 +140,37 @@ PROLOGUE
andi I, N, 0x1f
srai.d I, I, 3
#endif
- bge $r0, I, .L14 /* <8 */
+ bge $r0, I, .L14 /* DOUBLE: <4 ; FLOAT: <8 */
+ .align 3
.L13:
- /* case 8~31 */
+ /* FLOAT: 8~31 ; DOUBLE: 4~15 */
xvld $xr0, X, 0
xvld $xr4, Y, 0
addi.w I, I, -1
addi.d X, X, 32
addi.d Y, Y, 32
+ #ifdef DSDOT
+ xvfcvtl.d.s $xr10, $xr0
+ xvfcvtl.d.s $xr11, $xr4
+ xvfcvth.d.s $xr12, $xr0
+ xvfcvth.d.s $xr13, $xr4
+ xvfmadd.d $xr8, $xr10, $xr11, $xr8
+ xvfmadd.d $xr9, $xr12, $xr13, $xr9
+ #else
XVFMADD $xr8, $xr0, $xr4, $xr8
+ #endif
bnez I, .L13
+ .align 3
.L14:
/* store dot in s1 $f8 */
+ #ifdef DSDOT
+ xvfadd.d $xr8, $xr8, $xr9
+ fsub.s s2, s2, s2 /* set s2 to 0.0 */
+ xvpermi.q $xr0, $xr8, 0x1
+ vfadd.d $vr8, $vr8, $vr0
+ vpackod.d $vr0, $vr8, $vr8
+ vfadd.d $vr8, $vr8, $vr0
+ #else
XVFADD $xr8, $xr8, $xr9
SUB s2, s2, s2 /* set s2 to 0.0 */
xvpermi.q $xr0, $xr8, 0x1
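The DSDOT epilogue above reduces the four double partial sums to a scalar in $f8: xvpermi.q copies the high 128-bit half of $xr8 onto the low half of $xr0, vfadd.d folds 4 lanes to 2, vpackod.d replicates the odd lane, and a final vfadd.d leaves the total in lane 0. The same fold in C (a sketch, assuming four partials):

/* Horizontal sum of four double partials, mirroring the two folds:
   256 -> 128 bits (xvpermi.q + vfadd.d), 128 -> 64 bits (vpackod.d + vfadd.d). */
static double hsum4(const double v[4])
{
    double lane0 = v[0] + v[2];  /* low half + high half */
    double lane1 = v[1] + v[3];
    return lane0 + lane1;        /* odd lane folded onto even lane */
}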
@@ -125,7 +182,9 @@ PROLOGUE
VFADD $vr8, $vr8, $vr0
vpackod.w $vr0, $vr8, $vr8
VFADD $vr8, $vr8, $vr0
- #endif
+ #endif /* defined DOUBLE */
+ #endif /* defined DSDOT */
+ .align 3
.L15:
#ifdef DOUBLE
andi I, N, 0x3
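The tail bookkeeping follows from the masks: the bits of N below the main-loop width select what .L11 left over, the next shift sizes the 256-bit .L13 loop, and the lowest bits feed the scalar .L16 loop. For the float path (the double path masks with 0xf and 0x3), a sketch:

#include <stddef.h>

/* Float-path remainder split: elements missed by the 32-wide loop
   become 8-wide trips plus a scalar tail of at most 7 elements. */
static void float_tails(size_t n, size_t *vec8_trips, size_t *scalar_tail)
{
    size_t rem   = n & 0x1f;   /* andi I, N, 0x1f */
    *vec8_trips  = rem >> 3;   /* srai.d I, I, 3  */
    *scalar_tail = n & 0x7;    /* final .L16 count */
}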
@@ -135,7 +194,7 @@ PROLOGUE
bge $r0, I, .L999 /* =0 */
.align 3
.L16:
- /* case 1~7 */
+ /* FLOAT: 1~7 ; DOUBLE: 1~3 */
LD a1, X, 0
LD b1, Y, 0
#ifdef DSDOT