@@ -43,41 +43,41 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43
43
/*
 * Scale a contiguous run of x in place by the scalar stored at *da.
 *
 * n   element count; elements are handled in groups of eight, and a group
 *     is started whenever the running count is still below n (the caller
 *     in this file passes n & -8, i.e. a multiple of eight).
 * da  pointer to the scale factor; it is read once before the loop.
 * x   first element to scale; overwritten in place.
 */
static void dscal_kernel_8(BLASLONG n, FLOAT *da, FLOAT *x)
{
	const FLOAT scale = *da;
	FLOAT *group = x;
	BLASLONG done;

	for (done = 0; done < n; done += 8, group += 8)
	{
		BLASLONG lane;

		/* One full group of eight per outer iteration. */
		for (lane = 0; lane < 8; lane++)
		{
			group[lane] *= scale;
		}
	}
}
63
63
64
64
65
65
/*
 * Store 0.0 into a contiguous run of x.
 *
 * n      element count; elements are written in groups of eight, and a
 *        group is started whenever the running count is still below n.
 * alpha  not read by this routine; present only in the signature.
 * x      first element to clear; overwritten in place.
 */
static void dscal_kernel_8_zero(BLASLONG n, FLOAT *alpha, FLOAT *x)
{
	FLOAT *group = x;
	BLASLONG done = 0;

	while (done < n)
	{
		BLASLONG lane;

		/* Prior contents are never read, only overwritten. */
		for (lane = 0; lane < 8; lane++)
		{
			group[lane] = 0.0;
		}

		group += 8;
		done += 8;
	}
}
83
83
@@ -89,51 +89,51 @@ static void dscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_
89
89
static void dscal_kernel_inc_8 (BLASLONG n , FLOAT * alpha , FLOAT * x , BLASLONG inc_x )
90
90
{
91
91
92
- FLOAT * x1 = NULL ;
93
- BLASLONG inc_x3 ;
92
+ FLOAT * x1 = NULL ;
93
+ BLASLONG inc_x3 ;
94
94
95
- inc_x <<= 3 ;
96
- inc_x3 = (inc_x << 1 ) + inc_x ;
95
+ inc_x <<= 3 ;
96
+ inc_x3 = (inc_x << 1 ) + inc_x ;
97
97
98
98
__asm__ __volatile__
99
99
(
100
- "movddup (%3), %%xmm0 \n\t" // alpha
100
+ "movddup (%3), %%xmm0 \n\t" // alpha
101
101
102
- "leaq (%1,%4,4), %2 \n\t"
102
+ "leaq (%1,%4,4), %2 \n\t"
103
103
104
- ".p2align 4 \n\t"
104
+ ".p2align 4 \n\t"
105
105
106
- "1: \n\t"
107
- "movsd (%1) , %%xmm4 \n\t"
108
- "movhpd (%1,%4,1), %%xmm4 \n\t"
109
- "movsd (%1,%4,2), %%xmm5 \n\t"
110
- "movhpd (%1,%5,1), %%xmm5 \n\t"
106
+ "1: \n\t"
107
+ "movsd (%1) , %%xmm4 \n\t"
108
+ "movhpd (%1,%4,1), %%xmm4 \n\t"
109
+ "movsd (%1,%4,2), %%xmm5 \n\t"
110
+ "movhpd (%1,%5,1), %%xmm5 \n\t"
111
111
112
- "movsd (%2) , %%xmm6 \n\t"
113
- "movhpd (%2,%4,1), %%xmm6 \n\t"
114
- "movsd (%2,%4,2), %%xmm7 \n\t"
115
- "movhpd (%2,%5,1), %%xmm7 \n\t"
112
+ "movsd (%2) , %%xmm6 \n\t"
113
+ "movhpd (%2,%4,1), %%xmm6 \n\t"
114
+ "movsd (%2,%4,2), %%xmm7 \n\t"
115
+ "movhpd (%2,%5,1), %%xmm7 \n\t"
116
116
117
- "mulpd %%xmm0, %%xmm4 \n\t"
118
- "mulpd %%xmm0, %%xmm5 \n\t"
119
- "mulpd %%xmm0, %%xmm6 \n\t"
120
- "mulpd %%xmm0, %%xmm7 \n\t"
117
+ "mulpd %%xmm0, %%xmm4 \n\t"
118
+ "mulpd %%xmm0, %%xmm5 \n\t"
119
+ "mulpd %%xmm0, %%xmm6 \n\t"
120
+ "mulpd %%xmm0, %%xmm7 \n\t"
121
121
122
- "movsd %%xmm4 , (%1) \n\t"
123
- "movhpd %%xmm4 , (%1,%4,1) \n\t"
124
- "movsd %%xmm5 , (%1,%4,2) \n\t"
125
- "movhpd %%xmm5 , (%1,%5,1) \n\t"
122
+ "movsd %%xmm4 , (%1) \n\t"
123
+ "movhpd %%xmm4 , (%1,%4,1) \n\t"
124
+ "movsd %%xmm5 , (%1,%4,2) \n\t"
125
+ "movhpd %%xmm5 , (%1,%5,1) \n\t"
126
126
127
- "movsd %%xmm6 , (%2) \n\t"
128
- "movhpd %%xmm6 , (%2,%4,1) \n\t"
129
- "movsd %%xmm7 , (%2,%4,2) \n\t"
130
- "movhpd %%xmm7 , (%2,%5,1) \n\t"
127
+ "movsd %%xmm6 , (%2) \n\t"
128
+ "movhpd %%xmm6 , (%2,%4,1) \n\t"
129
+ "movsd %%xmm7 , (%2,%4,2) \n\t"
130
+ "movhpd %%xmm7 , (%2,%5,1) \n\t"
131
131
132
- "leaq (%1,%4,8), %1 \n\t"
133
- "leaq (%2,%4,8), %2 \n\t"
132
+ "leaq (%1,%4,8), %1 \n\t"
133
+ "leaq (%2,%4,8), %2 \n\t"
134
134
135
- "subq $8, %0 \n\t"
136
- "jnz 1b \n\t"
135
+ "subq $8, %0 \n\t"
136
+ "jnz 1b \n\t"
137
137
138
138
:
139
139
"+r" (n ), // 0
@@ -150,106 +150,96 @@ static void dscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_
150
150
"%xmm12" , "%xmm13" , "%xmm14" , "%xmm15" ,
151
151
"memory"
152
152
);
153
-
154
-
155
153
}
156
154
157
155
/*
 * Scale the n elements x[0], x[inc_x], x[2*inc_x], ... in place by da.
 *
 * Two behaviours are selected from da and dummy2:
 *
 *   - da == 0.0 and dummy2 != 1: every element is overwritten with 0.0
 *     without being read.
 *   - otherwise (including da == 0.0 with dummy2 == 1): every element is
 *     multiplied by da, so the product is whatever the floating-point
 *     multiply yields for that element (under IEEE arithmetic NaN and Inf
 *     entries times zero give NaN rather than 0).  Per the original
 *     in-file comment this is the fix for issue 4728 "when the caller is
 *     dscal"; the exact meaning of dummy2 is defined by the callers --
 *     TODO confirm against the interface layer.
 *
 * dummy0, dummy1, y, inc_y and dummy are not used here.
 *
 * A non-positive n is treated as "nothing to do".  Without that guard the
 * unit-stride tail loops would start at n & -8, which is negative for
 * negative n, and write before x.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
	BLASLONG i = 0, j = 0;
	BLASLONG n1;

	if (n <= 0)
	{
		return 0;
	}

	if (da == 0.0 && dummy2 != 1)
	{
		/* Plain zero fill: element values are not inspected. */
		if (inc_x == 1)
		{
			n1 = n & -8;
			if (n1 > 0)
			{
				dscal_kernel_8_zero(n1, &da, x);
			}
			for (i = n1; i < n; i++)
			{
				x[i] = 0.0;
			}
		}
		else
		{
			while (j < n)
			{
				x[i] = 0.0;
				i += inc_x;
				j++;
			}
		}
		return 0;
	}

	/*
	 * Multiply path.  The leading multiple-of-eight part goes through the
	 * unrolled kernel matching the stride; the remaining (n % 8) elements
	 * are handled by the scalar loop below.
	 */
	n1 = n & -8;
	if (n1 > 0)
	{
		if (inc_x == 1)
		{
			dscal_kernel_8(n1, &da, x);
		}
		else
		{
			dscal_kernel_inc_8(n1, &da, x, inc_x);
		}
		i = n1 * inc_x;
		j = n1;
	}

	while (j < n)
	{
		x[i] *= da;
		i += inc_x;
		j++;
	}

	return 0;
}
254
-
255
-
0 commit comments