@@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32
32
#define NBMAX 1024
33
33
34
34
35
- static const unsigned char swap_mask_arr []= { 4 ,5 ,6 ,7 ,0 ,1 ,2 ,3 , 12 ,13 ,14 ,15 , 8 ,9 ,10 ,11 };
35
+ static const unsigned char __attribute__(( aligned ( 16 ))) swap_mask_arr []= { 4 ,5 ,6 ,7 ,0 ,1 ,2 ,3 , 12 ,13 ,14 ,15 , 8 ,9 ,10 ,11 };
36
36
37
37
38
38
static void cgemv_kernel_4x4 (BLASLONG n , BLASLONG lda , FLOAT * ap , FLOAT * x , FLOAT * y ) {
@@ -62,23 +62,24 @@ static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA
62
62
register __vector float vx3_r = {x [6 ], - x [6 ],x [6 ], - x [6 ]};
63
63
register __vector float vx3_i = {x [7 ], x [7 ],x [7 ], x [7 ]};
64
64
#endif
65
- register __vector float * vy = (__vector float * ) y ;
65
+ register __vector float * vptr_y = (__vector float * ) y ;
66
66
register __vector float * vptr_a0 = (__vector float * ) a0 ;
67
67
register __vector float * vptr_a1 = (__vector float * ) a1 ;
68
68
register __vector float * vptr_a2 = (__vector float * ) a2 ;
69
69
register __vector float * vptr_a3 = (__vector float * ) a3 ;
70
70
BLASLONG i = 0 ;
71
- for (;i < n / 2 ; i += 2 ) {
72
- register __vector float vy_0 = vy [i ];
73
- register __vector float vy_1 = vy [i + 1 ];
74
- register __vector float va0 = vptr_a0 [i ];
75
- register __vector float va1 = vptr_a1 [i ];
76
- register __vector float va2 = vptr_a2 [i ];
77
- register __vector float va3 = vptr_a3 [i ];
78
- register __vector float va0_1 = vptr_a0 [i + 1 ];
79
- register __vector float va1_1 = vptr_a1 [i + 1 ];
80
- register __vector float va2_1 = vptr_a2 [i + 1 ];
81
- register __vector float va3_1 = vptr_a3 [i + 1 ];
71
+ BLASLONG i2 = 16 ;
72
+ for (;i < n * 8 ; i += 32 ,i2 += 32 ) {
73
+ register __vector float vy_0 = vec_vsx_ld (i ,vptr_y );
74
+ register __vector float vy_1 = vec_vsx_ld (i2 ,vptr_y );
75
+ register __vector float va0 = vec_vsx_ld (i ,vptr_a0 );
76
+ register __vector float va1 = vec_vsx_ld (i , vptr_a1 );
77
+ register __vector float va2 = vec_vsx_ld (i ,vptr_a2 );
78
+ register __vector float va3 = vec_vsx_ld (i ,vptr_a3 );
79
+ register __vector float va0_1 = vec_vsx_ld (i2 ,vptr_a0 );
80
+ register __vector float va1_1 = vec_vsx_ld (i2 ,vptr_a1 );
81
+ register __vector float va2_1 = vec_vsx_ld (i2 ,vptr_a2 );
82
+ register __vector float va3_1 = vec_vsx_ld (i2 ,vptr_a3 );
82
83
83
84
vy_0 += va0 * vx0_r + va1 * vx1_r + va2 * vx2_r + va3 * vx3_r ;
84
85
vy_1 += va0_1 * vx0_r + va1_1 * vx1_r + va2_1 * vx2_r + va3_1 * vx3_r ;
@@ -93,8 +94,8 @@ static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA
93
94
vy_0 += va0 * vx0_i + va1 * vx1_i + va2 * vx2_i + va3 * vx3_i ;
94
95
vy_1 += va0_1 * vx0_i + va1_1 * vx1_i + va2_1 * vx2_i + va3_1 * vx3_i ;
95
96
96
- vy [ i ] = vy_0 ;
97
- vy [ i + 1 ] = vy_1 ;
97
+ vec_vsx_st ( vy_0 , i , vptr_y ) ;
98
+ vec_vsx_st ( vy_1 , i2 , vptr_y ) ;
98
99
}
99
100
100
101
}
@@ -118,26 +119,28 @@ static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA
118
119
register __vector float vx1_r = {x [2 ], - x [2 ],x [2 ], - x [2 ]};
119
120
register __vector float vx1_i = {x [3 ], x [3 ],x [3 ], x [3 ]};
120
121
#endif
121
- register __vector float * vy = (__vector float * ) y ;
122
+ register __vector float * vptr_y = (__vector float * ) y ;
122
123
register __vector float * vptr_a0 = (__vector float * ) a0 ;
123
124
register __vector float * vptr_a1 = (__vector float * ) a1 ;
124
- BLASLONG i = 0 ;
125
- for (;i < n / 2 ; i += 2 ) {
126
- register __vector float vy_0 = vy [i ];
127
- register __vector float vy_1 = vy [i + 1 ];
128
- register __vector float va0 = vptr_a0 [i ];
129
- register __vector float va1 = vptr_a1 [i ];
130
- register __vector float va0_1 = vptr_a0 [i + 1 ];
131
- register __vector float va1_1 = vptr_a1 [i + 1 ];
125
+ BLASLONG i = 0 ;
126
+ BLASLONG i2 = 16 ;
127
+ for (;i < n * 8 ; i += 32 , i2 += 32 ) {
128
+ register __vector float vy_0 = vec_vsx_ld (i ,vptr_y );
129
+ register __vector float vy_1 = vec_vsx_ld (i2 ,vptr_y );
130
+ register __vector float va0 = vec_vsx_ld (i ,vptr_a0 );
131
+ register __vector float va1 = vec_vsx_ld (i , vptr_a1 );
132
+ register __vector float va0_1 = vec_vsx_ld (i2 ,vptr_a0 );
133
+ register __vector float va1_1 = vec_vsx_ld (i2 ,vptr_a1 );
134
+
132
135
register __vector float va0x = vec_perm (va0 , va0 ,swap_mask );
133
136
register __vector float va0x_1 = vec_perm (va0_1 , va0_1 ,swap_mask );
134
137
register __vector float va1x = vec_perm (va1 , va1 ,swap_mask );
135
138
register __vector float va1x_1 = vec_perm (va1_1 , va1_1 ,swap_mask );
136
139
vy_0 += va0 * vx0_r + va1 * vx1_r + va0x * vx0_i + va1x * vx1_i ;
137
140
vy_1 += va0_1 * vx0_r + va1_1 * vx1_r + va0x_1 * vx0_i + va1x_1 * vx1_i ;
138
141
139
- vy [ i ] = vy_0 ;
140
- vy [ i + 1 ] = vy_1 ;
142
+ vec_vsx_st ( vy_0 , i , vptr_y ) ;
143
+ vec_vsx_st ( vy_1 , i2 , vptr_y ) ;
141
144
}
142
145
143
146
}
@@ -154,29 +157,31 @@ static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) {
154
157
register __vector float vx0_r = {x [0 ], - x [0 ],x [0 ], - x [0 ]};
155
158
register __vector float vx0_i = {x [1 ], x [1 ],x [1 ], x [1 ]};
156
159
#endif
157
- register __vector float * vy = (__vector float * ) y ;
160
+ register __vector float * vptr_y = (__vector float * ) y ;
158
161
register __vector float * vptr_a0 = (__vector float * ) ap ;
159
162
BLASLONG i = 0 ;
160
- for (;i < n / 2 ; i += 2 ) {
161
- register __vector float vy_0 = vy [i ];
162
- register __vector float vy_1 = vy [i + 1 ];
163
- register __vector float va0 = vptr_a0 [i ];
164
- register __vector float va0_1 = vptr_a0 [i + 1 ];
163
+ BLASLONG i2 = 16 ;
164
+ for (;i < n * 8 ; i += 32 , i2 += 32 ) {
165
+ register __vector float vy_0 = vec_vsx_ld (i ,vptr_y );
166
+ register __vector float vy_1 = vec_vsx_ld (i2 ,vptr_y );
167
+ register __vector float va0 = vec_vsx_ld (i ,vptr_a0 );
168
+ register __vector float va0_1 = vec_vsx_ld (i2 ,vptr_a0 );
169
+
165
170
register __vector float va0x = vec_perm (va0 , va0 ,swap_mask );
166
171
register __vector float va0x_1 = vec_perm (va0_1 , va0_1 ,swap_mask );
167
172
vy_0 += va0 * vx0_r + va0x * vx0_i ;
168
173
vy_1 += va0_1 * vx0_r + va0x_1 * vx0_i ;
169
174
170
- vy [ i ] = vy_0 ;
171
- vy [ i + 1 ] = vy_1 ;
175
+ vec_vsx_st ( vy_0 , i , vptr_y ) ;
176
+ vec_vsx_st ( vy_1 , i2 , vptr_y ) ;
172
177
}
173
178
}
174
179
175
180
176
181
177
182
178
183
static void add_y (BLASLONG n , FLOAT * src , FLOAT * dest , BLASLONG inc_dest , FLOAT alpha_r , FLOAT alpha_i ) {
179
- BLASLONG i ;
184
+ BLASLONG i = 0 ;
180
185
181
186
182
187
if (inc_dest != 2 ) {
@@ -213,20 +218,24 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT
213
218
214
219
register __vector float * vptr_src = (__vector float * ) src ;
215
220
register __vector float * vptr_y = (__vector float * ) dest ;
216
- for (i = 0 ; i < n /2 ; i += 2 ){
217
221
218
- register __vector float vy_0 = vptr_y [i ];
219
- register __vector float vy_1 = vptr_y [i + 1 ];
222
+ BLASLONG i2 = 16 ;
223
+ for (;i < n * 8 ; i += 32 , i2 += 32 ) {
224
+ register __vector float vy_0 = vec_vsx_ld (i ,vptr_y );
225
+ register __vector float vy_1 = vec_vsx_ld (i2 ,vptr_y );
226
+
220
227
221
- register __vector float vsrc = vptr_src [i ];
222
- register __vector float vsrc_1 = vptr_src [i + 1 ];
223
- register __vector float vsrcx = vec_perm (vsrc , vsrc , swap_mask );
224
- register __vector float vsrcx_1 = vec_perm (vsrc_1 , vsrc_1 , swap_mask );
228
+ register __vector float vsrc = vec_vsx_ld (i ,vptr_src );
229
+ register __vector float vsrc_1 = vec_vsx_ld (i2 ,vptr_src );
225
230
226
- vy_0 += vsrc * valpha_r + vsrcx * valpha_i ;
227
- vy_1 += vsrc_1 * valpha_r + vsrcx_1 * valpha_i ;
228
- vptr_y [i ] = vy_0 ;
229
- vptr_y [i + 1 ] = vy_1 ;
231
+ register __vector float vsrcx = vec_perm (vsrc , vsrc , swap_mask );
232
+ register __vector float vsrcx_1 = vec_perm (vsrc_1 , vsrc_1 , swap_mask );
233
+
234
+ vy_0 += vsrc * valpha_r + vsrcx * valpha_i ;
235
+ vy_1 += vsrc_1 * valpha_r + vsrcx_1 * valpha_i ;
236
+
237
+ vec_vsx_st (vy_0 ,i , vptr_y );
238
+ vec_vsx_st (vy_1 ,i2 ,vptr_y );
230
239
231
240
}
232
241
@@ -237,7 +246,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT
237
246
238
247
239
248
int CNAME (BLASLONG m , BLASLONG n , BLASLONG dummy1 , FLOAT alpha_r , FLOAT alpha_i , FLOAT * a , BLASLONG lda , FLOAT * x , BLASLONG inc_x , FLOAT * y , BLASLONG inc_y , FLOAT * buffer ) {
240
- BLASLONG i ;
249
+ BLASLONG i = 0 ;
241
250
FLOAT * a_ptr ;
242
251
FLOAT * x_ptr ;
243
252
FLOAT * y_ptr ;
@@ -247,8 +256,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
247
256
BLASLONG m2 ;
248
257
BLASLONG m3 ;
249
258
BLASLONG n2 ;
250
-
251
- FLOAT xbuffer [ 8 ], * ybuffer ;
259
+ FLOAT xbuffer [ 8 ] __attribute__(( aligned ( 16 )));
260
+ FLOAT * ybuffer ;
252
261
253
262
if (m < 1 ) return (0 );
254
263
if (n < 1 ) return (0 );
0 commit comments