@@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32
32
static const unsigned char __attribute__((aligned (16 ))) swap_mask_arr []= { 4 ,5 ,6 ,7 ,0 ,1 ,2 ,3 , 12 ,13 ,14 ,15 , 8 ,9 ,10 ,11 };
33
33
34
34
static void cgemv_kernel_4x4 (BLASLONG n , BLASLONG lda , FLOAT * ap , FLOAT * x , FLOAT * y , FLOAT alpha_r , FLOAT alpha_i ) {
35
- BLASLONG i ;
35
+
36
36
FLOAT * a0 , * a1 , * a2 , * a3 ;
37
37
a0 = ap ;
38
38
a1 = ap + lda ;
@@ -48,26 +48,39 @@ static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA
48
48
register __vector float vtemp2_r = {0.0 , 0.0 ,0.0 ,0.0 };
49
49
register __vector float vtemp3_p = {0.0 , 0.0 ,0.0 ,0.0 };
50
50
register __vector float vtemp3_r = {0.0 , 0.0 ,0.0 ,0.0 };
51
- __vector float * va0 = (__vector float * ) a0 ;
52
- __vector float * va1 = (__vector float * ) a1 ;
53
- __vector float * va2 = (__vector float * ) a2 ;
54
- __vector float * va3 = (__vector float * ) a3 ;
51
+ __vector float * vptr_a0 = (__vector float * ) a0 ;
52
+ __vector float * vptr_a1 = (__vector float * ) a1 ;
53
+ __vector float * vptr_a2 = (__vector float * ) a2 ;
54
+ __vector float * vptr_a3 = (__vector float * ) a3 ;
55
55
__vector float * v_x = (__vector float * ) x ;
56
56
57
- for (i = 0 ; i < n / 2 ; i += 2 ) {
58
- register __vector float vx_0 = v_x [i ];
59
- register __vector float vx_1 = v_x [i + 1 ];
57
+ BLASLONG i = 0 ;
58
+ BLASLONG i2 = 16 ;
59
+ for (;i < n * 8 ; i += 32 , i2 += 32 ) {
60
+ register __vector float vx_0 = vec_vsx_ld ( i ,v_x ) ;
61
+ register __vector float vx_1 = vec_vsx_ld (i2 , v_x );
62
+
60
63
register __vector float vxr_0 = vec_perm (vx_0 , vx_0 , swap_mask );
61
64
register __vector float vxr_1 = vec_perm (vx_1 , vx_1 , swap_mask );
62
65
63
- vtemp0_p += vx_0 * va0 [i ] + vx_1 * va0 [i + 1 ] ;
64
- vtemp0_r += vxr_0 * va0 [i ] + vxr_1 * va0 [i + 1 ];
65
- vtemp1_p += vx_0 * va1 [i ] + vx_1 * va1 [i + 1 ];
66
- vtemp1_r += vxr_0 * va1 [i ] + vxr_1 * va1 [i + 1 ];
67
- vtemp2_p += vx_0 * va2 [i ] + vx_1 * va2 [i + 1 ];
68
- vtemp2_r += vxr_0 * va2 [i ] + vxr_1 * va2 [i + 1 ];
69
- vtemp3_p += vx_0 * va3 [i ] + vx_1 * va3 [i + 1 ];
70
- vtemp3_r += vxr_0 * va3 [i ] + vxr_1 * va3 [i + 1 ];
66
+ register __vector float va0 = vec_vsx_ld (i ,vptr_a0 );
67
+ register __vector float va1 = vec_vsx_ld (i , vptr_a1 );
68
+ register __vector float va2 = vec_vsx_ld (i ,vptr_a2 );
69
+ register __vector float va3 = vec_vsx_ld (i ,vptr_a3 );
70
+ register __vector float va0_1 = vec_vsx_ld (i2 ,vptr_a0 );
71
+ register __vector float va1_1 = vec_vsx_ld (i2 ,vptr_a1 );
72
+ register __vector float va2_1 = vec_vsx_ld (i2 ,vptr_a2 );
73
+ register __vector float va3_1 = vec_vsx_ld (i2 ,vptr_a3 );
74
+
75
+
76
+ vtemp0_p += vx_0 * va0 + vx_1 * va0_1 ;
77
+ vtemp0_r += vxr_0 * va0 + vxr_1 * va0_1 ;
78
+ vtemp1_p += vx_0 * va1 + vx_1 * va1_1 ;
79
+ vtemp1_r += vxr_0 * va1 + vxr_1 * va1_1 ;
80
+ vtemp2_p += vx_0 * va2 + vx_1 * va2_1 ;
81
+ vtemp2_r += vxr_0 * va2 + vxr_1 * va2_1 ;
82
+ vtemp3_p += vx_0 * va3 + vx_1 * va3_1 ;
83
+ vtemp3_r += vxr_0 * va3 + vxr_1 * va3_1 ;
71
84
72
85
}
73
86
@@ -128,7 +141,7 @@ static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA
128
141
129
142
130
143
static void cgemv_kernel_4x2 (BLASLONG n , BLASLONG lda , FLOAT * ap , FLOAT * x , FLOAT * y , FLOAT alpha_r , FLOAT alpha_i ) {
131
- BLASLONG i ;
144
+
132
145
FLOAT * a0 , * a1 ;
133
146
a0 = ap ;
134
147
a1 = ap + lda ;
@@ -138,23 +151,33 @@ static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA
138
151
register __vector float vtemp0_r = {0.0 , 0.0 ,0.0 ,0.0 };
139
152
register __vector float vtemp1_p = {0.0 , 0.0 ,0.0 ,0.0 };
140
153
register __vector float vtemp1_r = {0.0 , 0.0 ,0.0 ,0.0 };
141
- __vector float * va0 = (__vector float * ) a0 ;
142
- __vector float * va1 = (__vector float * ) a1 ;
154
+
155
+
156
+ __vector float * vptr_a0 = (__vector float * ) a0 ;
157
+ __vector float * vptr_a1 = (__vector float * ) a1 ;
143
158
__vector float * v_x = (__vector float * ) x ;
144
159
145
- for (i = 0 ; i < n / 2 ; i += 2 ) {
146
- register __vector float vx_0 = v_x [i ];
147
- register __vector float vx_1 = v_x [i + 1 ];
160
+ BLASLONG i = 0 ;
161
+ BLASLONG i2 = 16 ;
162
+ for (;i < n * 8 ; i += 32 , i2 += 32 ) {
163
+ register __vector float vx_0 = vec_vsx_ld ( i ,v_x ) ;
164
+ register __vector float vx_1 = vec_vsx_ld (i2 , v_x );
165
+
148
166
register __vector float vxr_0 = vec_perm (vx_0 , vx_0 , swap_mask );
149
167
register __vector float vxr_1 = vec_perm (vx_1 , vx_1 , swap_mask );
150
168
151
- vtemp0_p += vx_0 * va0 [ i ] + vx_1 * va0 [ i + 1 ] ;
152
- vtemp0_r += vxr_0 * va0 [ i ] + vxr_1 * va0 [ i + 1 ] ;
153
- vtemp1_p += vx_0 * va1 [ i ] + vx_1 * va1 [ i + 1 ] ;
154
- vtemp1_r += vxr_0 * va1 [ i ] + vxr_1 * va1 [ i + 1 ];
169
+ register __vector float va0 = vec_vsx_ld ( i , vptr_a0 ) ;
170
+ register __vector float va1 = vec_vsx_ld ( i , vptr_a1 ) ;
171
+ register __vector float va0_1 = vec_vsx_ld ( i2 , vptr_a0 ) ;
172
+ register __vector float va1_1 = vec_vsx_ld ( i2 , vptr_a1 );
155
173
156
- }
157
174
175
+ vtemp0_p += vx_0 * va0 + vx_1 * va0_1 ;
176
+ vtemp0_r += vxr_0 * va0 + vxr_1 * va0_1 ;
177
+ vtemp1_p += vx_0 * va1 + vx_1 * va1_1 ;
178
+ vtemp1_r += vxr_0 * va1 + vxr_1 * va1_1 ;
179
+
180
+ }
158
181
#if ( !defined(CONJ ) && !defined(XCONJ ) ) || ( defined(CONJ ) && defined(XCONJ ) )
159
182
160
183
register FLOAT temp_r0 = vtemp0_p [0 ] - vtemp0_p [1 ] + vtemp0_p [2 ] - vtemp0_p [3 ];
@@ -193,23 +216,27 @@ static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA
193
216
194
217
195
218
static void cgemv_kernel_4x1 (BLASLONG n , FLOAT * ap , FLOAT * x , FLOAT * y , FLOAT alpha_r , FLOAT alpha_i ) {
196
- BLASLONG i ;
219
+
197
220
__vector unsigned char swap_mask = * ((__vector unsigned char * )swap_mask_arr );
198
221
// p for positive (real*real, imag*imag, real*real, imag*imag); r for imaginary (real*imag, imag*real, real*imag, imag*real)
199
222
register __vector float vtemp0_p = {0.0 , 0.0 ,0.0 ,0.0 };
200
223
register __vector float vtemp0_r = {0.0 , 0.0 ,0.0 ,0.0 };
201
- __vector float * va0 = (__vector float * ) ap ;
224
+ __vector float * vptr_a0 = (__vector float * ) ap ;
202
225
__vector float * v_x = (__vector float * ) x ;
203
-
204
- for (i = 0 ; i < n / 2 ; i += 2 ) {
205
- register __vector float vx_0 = v_x [i ];
206
- register __vector float vx_1 = v_x [i + 1 ];
226
+ BLASLONG i = 0 ;
227
+ BLASLONG i2 = 16 ;
228
+ for (;i < n * 8 ; i += 32 , i2 += 32 ) {
229
+ register __vector float vx_0 = vec_vsx_ld ( i ,v_x ) ;
230
+ register __vector float vx_1 = vec_vsx_ld (i2 , v_x );
231
+
207
232
register __vector float vxr_0 = vec_perm (vx_0 , vx_0 , swap_mask );
208
233
register __vector float vxr_1 = vec_perm (vx_1 , vx_1 , swap_mask );
209
234
210
- vtemp0_p += vx_0 * va0 [ i ] + vx_1 * va0 [ i + 1 ] ;
211
- vtemp0_r += vxr_0 * va0 [ i ] + vxr_1 * va0 [ i + 1 ] ;
235
+ register __vector float va0 = vec_vsx_ld ( i , vptr_a0 );
236
+ register __vector float va0_1 = vec_vsx_ld ( i2 , vptr_a0 ) ;
212
237
238
+ vtemp0_p += vx_0 * va0 + vx_1 * va0_1 ;
239
+ vtemp0_r += vxr_0 * va0 + vxr_1 * va0_1 ;
213
240
}
214
241
215
242
#if ( !defined(CONJ ) && !defined(XCONJ ) ) || ( defined(CONJ ) && defined(XCONJ ) )
0 commit comments