Skip to content

Commit e79712d

Browse files
cgemv: use vec_vsx_ld instead of letting gcc decide
1 parent be09551 commit e79712d

File tree

2 files changed

+115
-79
lines changed

2 files changed

+115
-79
lines changed

kernel/power/cgemv_n.c

Lines changed: 53 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -62,23 +62,24 @@ static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA
6262
register __vector float vx3_r = {x[6], -x[6],x[6], -x[6]};
6363
register __vector float vx3_i = {x[7], x[7],x[7], x[7]};
6464
#endif
65-
register __vector float *vy = (__vector float *) y;
65+
register __vector float *vptr_y = (__vector float *) y;
6666
register __vector float *vptr_a0 = (__vector float *) a0;
6767
register __vector float *vptr_a1 = (__vector float *) a1;
6868
register __vector float *vptr_a2 = (__vector float *) a2;
6969
register __vector float *vptr_a3 = (__vector float *) a3;
7070
BLASLONG i = 0;
71-
for (;i< n / 2; i+=2) {
72-
register __vector float vy_0 = vy[i];
73-
register __vector float vy_1 = vy[i + 1];
74-
register __vector float va0 = vptr_a0[i];
75-
register __vector float va1 = vptr_a1[i];
76-
register __vector float va2 = vptr_a2[i];
77-
register __vector float va3 = vptr_a3[i];
78-
register __vector float va0_1 = vptr_a0[i + 1];
79-
register __vector float va1_1 = vptr_a1[i + 1];
80-
register __vector float va2_1 = vptr_a2[i + 1];
81-
register __vector float va3_1 = vptr_a3[i + 1];
71+
BLASLONG i2=16;
72+
for (;i< n * 8; i+=32,i2+=32) {
73+
register __vector float vy_0 = vec_vsx_ld(i,vptr_y);
74+
register __vector float vy_1 = vec_vsx_ld(i2,vptr_y);
75+
register __vector float va0 = vec_vsx_ld(i,vptr_a0);
76+
register __vector float va1 = vec_vsx_ld(i, vptr_a1);
77+
register __vector float va2 = vec_vsx_ld(i ,vptr_a2);
78+
register __vector float va3 = vec_vsx_ld(i ,vptr_a3);
79+
register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0);
80+
register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1);
81+
register __vector float va2_1 = vec_vsx_ld(i2 ,vptr_a2);
82+
register __vector float va3_1 = vec_vsx_ld(i2 ,vptr_a3);
8283

8384
vy_0 += va0*vx0_r + va1*vx1_r + va2*vx2_r + va3*vx3_r;
8485
vy_1 += va0_1*vx0_r + va1_1*vx1_r + va2_1*vx2_r + va3_1*vx3_r;
@@ -93,8 +94,8 @@ static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA
9394
vy_0 += va0*vx0_i + va1*vx1_i + va2*vx2_i + va3*vx3_i;
9495
vy_1 += va0_1*vx0_i + va1_1*vx1_i + va2_1*vx2_i + va3_1*vx3_i;
9596

96-
vy[i] = vy_0;
97-
vy[i + 1] = vy_1;
97+
vec_vsx_st(vy_0 ,i, vptr_y);
98+
vec_vsx_st(vy_1,i2,vptr_y);
9899
}
99100

100101
}
@@ -118,26 +119,28 @@ static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA
118119
register __vector float vx1_r = {x[2], -x[2],x[2], -x[2]};
119120
register __vector float vx1_i = {x[3], x[3],x[3], x[3]};
120121
#endif
121-
register __vector float *vy = (__vector float *) y;
122+
register __vector float *vptr_y = (__vector float *) y;
122123
register __vector float *vptr_a0 = (__vector float *) a0;
123124
register __vector float *vptr_a1 = (__vector float *) a1;
124-
BLASLONG i = 0;
125-
for (;i< n / 2; i+=2) {
126-
register __vector float vy_0 = vy[i];
127-
register __vector float vy_1 = vy[i + 1];
128-
register __vector float va0 = vptr_a0[i];
129-
register __vector float va1 = vptr_a1[i];
130-
register __vector float va0_1 = vptr_a0[i + 1];
131-
register __vector float va1_1 = vptr_a1[i + 1];
125+
BLASLONG i = 0;
126+
BLASLONG i2 = 16;
127+
for (;i< n * 8; i+=32, i2+=32) {
128+
register __vector float vy_0 = vec_vsx_ld(i,vptr_y);
129+
register __vector float vy_1 = vec_vsx_ld(i2,vptr_y);
130+
register __vector float va0 = vec_vsx_ld(i,vptr_a0);
131+
register __vector float va1 = vec_vsx_ld(i, vptr_a1);
132+
register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0);
133+
register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1);
134+
132135
register __vector float va0x = vec_perm(va0, va0,swap_mask);
133136
register __vector float va0x_1 = vec_perm(va0_1, va0_1,swap_mask);
134137
register __vector float va1x = vec_perm(va1, va1,swap_mask);
135138
register __vector float va1x_1 = vec_perm(va1_1, va1_1,swap_mask);
136139
vy_0 += va0*vx0_r + va1*vx1_r + va0x*vx0_i + va1x*vx1_i;
137140
vy_1 += va0_1*vx0_r + va1_1*vx1_r + va0x_1*vx0_i + va1x_1*vx1_i;
138141

139-
vy[i] = vy_0;
140-
vy[i + 1] = vy_1;
142+
vec_vsx_st(vy_0 ,i, vptr_y);
143+
vec_vsx_st(vy_1,i2,vptr_y);
141144
}
142145

143146
}
@@ -154,21 +157,23 @@ static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) {
154157
register __vector float vx0_r = {x[0], -x[0],x[0], -x[0]};
155158
register __vector float vx0_i = {x[1], x[1],x[1], x[1]};
156159
#endif
157-
register __vector float *vy = (__vector float *) y;
160+
register __vector float *vptr_y = (__vector float *) y;
158161
register __vector float *vptr_a0 = (__vector float *) ap;
159162
BLASLONG i = 0;
160-
for (;i< n / 2; i+=2) {
161-
register __vector float vy_0 = vy[i];
162-
register __vector float vy_1 = vy[i + 1];
163-
register __vector float va0 = vptr_a0[i];
164-
register __vector float va0_1 = vptr_a0[i + 1];
163+
BLASLONG i2 = 16;
164+
for (;i< n * 8; i+=32, i2+=32) {
165+
register __vector float vy_0 = vec_vsx_ld(i,vptr_y);
166+
register __vector float vy_1 = vec_vsx_ld(i2,vptr_y);
167+
register __vector float va0 = vec_vsx_ld(i,vptr_a0);
168+
register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0);
169+
165170
register __vector float va0x = vec_perm(va0, va0,swap_mask);
166171
register __vector float va0x_1 = vec_perm(va0_1, va0_1,swap_mask);
167172
vy_0 += va0*vx0_r + va0x*vx0_i;
168173
vy_1 += va0_1*vx0_r + va0x_1*vx0_i;
169174

170-
vy[i] = vy_0;
171-
vy[i + 1] = vy_1;
175+
vec_vsx_st(vy_0 ,i, vptr_y);
176+
vec_vsx_st(vy_1,i2,vptr_y);
172177
}
173178
}
174179

@@ -213,20 +218,24 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT
213218

214219
register __vector float *vptr_src = (__vector float *) src;
215220
register __vector float *vptr_y = (__vector float *) dest;
216-
for (i = 0; i < n/2; i += 2 ){
217221

218-
register __vector float vy_0 = vptr_y[i];
219-
register __vector float vy_1 = vptr_y[i +1];
222+
BLASLONG i2 = 16;
223+
for (;i< n * 8; i+=32, i2+=32) {
224+
register __vector float vy_0 = vec_vsx_ld(i,vptr_y);
225+
register __vector float vy_1 = vec_vsx_ld(i2,vptr_y);
226+
227+
228+
register __vector float vsrc = vec_vsx_ld(i,vptr_src);
229+
register __vector float vsrc_1 = vec_vsx_ld(i2,vptr_src);
230+
231+
register __vector float vsrcx = vec_perm(vsrc, vsrc, swap_mask);
232+
register __vector float vsrcx_1 = vec_perm(vsrc_1, vsrc_1, swap_mask);
220233

221-
register __vector float vsrc = vptr_src[i];
222-
register __vector float vsrc_1 = vptr_src[i + 1];
223-
register __vector float vsrcx = vec_perm(vsrc, vsrc, swap_mask);
224-
register __vector float vsrcx_1 = vec_perm(vsrc_1, vsrc_1, swap_mask);
234+
vy_0 += vsrc*valpha_r + vsrcx*valpha_i;
235+
vy_1 += vsrc_1*valpha_r + vsrcx_1*valpha_i;
225236

226-
vy_0 += vsrc*valpha_r + vsrcx*valpha_i;
227-
vy_1 += vsrc_1*valpha_r + vsrcx_1*valpha_i;
228-
vptr_y[i] = vy_0;
229-
vptr_y[i+1 ] = vy_1;
237+
vec_vsx_st(vy_0 ,i, vptr_y);
238+
vec_vsx_st(vy_1,i2,vptr_y);
230239

231240
}
232241

kernel/power/cgemv_t.c

Lines changed: 62 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3232
static const unsigned char __attribute__((aligned(16))) swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11};
3333

3434
static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
35-
BLASLONG i;
35+
3636
FLOAT *a0, *a1, *a2, *a3;
3737
a0 = ap;
3838
a1 = ap + lda;
@@ -48,26 +48,39 @@ static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA
4848
register __vector float vtemp2_r = {0.0, 0.0,0.0,0.0};
4949
register __vector float vtemp3_p = {0.0, 0.0,0.0,0.0};
5050
register __vector float vtemp3_r = {0.0, 0.0,0.0,0.0};
51-
__vector float* va0 = (__vector float*) a0;
52-
__vector float* va1 = (__vector float*) a1;
53-
__vector float* va2 = (__vector float*) a2;
54-
__vector float* va3 = (__vector float*) a3;
51+
__vector float* vptr_a0 = (__vector float*) a0;
52+
__vector float* vptr_a1 = (__vector float*) a1;
53+
__vector float* vptr_a2 = (__vector float*) a2;
54+
__vector float* vptr_a3 = (__vector float*) a3;
5555
__vector float* v_x = (__vector float*) x;
5656

57-
for (i = 0; i < n / 2; i+=2) {
58-
register __vector float vx_0 = v_x[i];
59-
register __vector float vx_1 = v_x[i+1];
57+
BLASLONG i = 0;
58+
BLASLONG i2 = 16;
59+
for (;i< n * 8; i+=32, i2+=32) {
60+
register __vector float vx_0 = vec_vsx_ld( i,v_x) ;
61+
register __vector float vx_1 = vec_vsx_ld(i2, v_x);
62+
6063
register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask);
6164
register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask);
6265

63-
vtemp0_p += vx_0*va0[i] + vx_1*va0[i+1] ;
64-
vtemp0_r += vxr_0*va0[i] + vxr_1*va0[i+1];
65-
vtemp1_p += vx_0*va1[i] + vx_1*va1[i+1];
66-
vtemp1_r += vxr_0*va1[i] + vxr_1*va1[i+1];
67-
vtemp2_p += vx_0*va2[i] + vx_1*va2[i+1];
68-
vtemp2_r += vxr_0*va2[i] + vxr_1*va2[i+1];
69-
vtemp3_p += vx_0*va3[i] + vx_1*va3[i+1];
70-
vtemp3_r += vxr_0*va3[i] + vxr_1*va3[i+1];
66+
register __vector float va0 = vec_vsx_ld(i,vptr_a0);
67+
register __vector float va1 = vec_vsx_ld(i, vptr_a1);
68+
register __vector float va2 = vec_vsx_ld(i ,vptr_a2);
69+
register __vector float va3 = vec_vsx_ld(i ,vptr_a3);
70+
register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0);
71+
register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1);
72+
register __vector float va2_1 = vec_vsx_ld(i2 ,vptr_a2);
73+
register __vector float va3_1 = vec_vsx_ld(i2 ,vptr_a3);
74+
75+
76+
vtemp0_p += vx_0*va0 + vx_1*va0_1 ;
77+
vtemp0_r += vxr_0*va0 + vxr_1*va0_1;
78+
vtemp1_p += vx_0*va1 + vx_1*va1_1;
79+
vtemp1_r += vxr_0*va1 + vxr_1*va1_1;
80+
vtemp2_p += vx_0*va2 + vx_1*va2_1;
81+
vtemp2_r += vxr_0*va2 + vxr_1*va2_1;
82+
vtemp3_p += vx_0*va3 + vx_1*va3_1;
83+
vtemp3_r += vxr_0*va3 + vxr_1*va3_1;
7184

7285
}
7386

@@ -128,7 +141,7 @@ static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA
128141

129142

130143
static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
131-
BLASLONG i;
144+
132145
FLOAT *a0, *a1;
133146
a0 = ap;
134147
a1 = ap + lda;
@@ -138,23 +151,33 @@ static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA
138151
register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0};
139152
register __vector float vtemp1_p = {0.0, 0.0,0.0,0.0};
140153
register __vector float vtemp1_r = {0.0, 0.0,0.0,0.0};
141-
__vector float* va0 = (__vector float*) a0;
142-
__vector float* va1 = (__vector float*) a1;
154+
155+
156+
__vector float* vptr_a0 = (__vector float*) a0;
157+
__vector float* vptr_a1 = (__vector float*) a1;
143158
__vector float* v_x = (__vector float*) x;
144159

145-
for (i = 0; i < n / 2; i+=2) {
146-
register __vector float vx_0 = v_x[i];
147-
register __vector float vx_1 = v_x[i+1];
160+
BLASLONG i = 0;
161+
BLASLONG i2 = 16;
162+
for (;i< n * 8; i+=32, i2+=32) {
163+
register __vector float vx_0 = vec_vsx_ld( i,v_x) ;
164+
register __vector float vx_1 = vec_vsx_ld(i2, v_x);
165+
148166
register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask);
149167
register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask);
150168

151-
vtemp0_p += vx_0*va0[i] + vx_1*va0[i+1] ;
152-
vtemp0_r += vxr_0*va0[i] + vxr_1*va0[i+1];
153-
vtemp1_p += vx_0*va1[i] + vx_1*va1[i+1];
154-
vtemp1_r += vxr_0*va1[i] + vxr_1*va1[i+1];
169+
register __vector float va0 = vec_vsx_ld(i,vptr_a0);
170+
register __vector float va1 = vec_vsx_ld(i, vptr_a1);
171+
register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0);
172+
register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1);
155173

156-
}
157174

175+
vtemp0_p += vx_0*va0 + vx_1*va0_1 ;
176+
vtemp0_r += vxr_0*va0 + vxr_1*va0_1;
177+
vtemp1_p += vx_0*va1 + vx_1*va1_1;
178+
vtemp1_r += vxr_0*va1 + vxr_1*va1_1;
179+
180+
}
158181
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
159182

160183
register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1] + vtemp0_p[2] - vtemp0_p[3];
@@ -193,23 +216,27 @@ static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA
193216

194217

195218
static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
196-
BLASLONG i;
219+
197220
__vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr);
198221
//p for positive (real*real, imag*imag, real*real, imag*imag), r for imaginary (real*imag, imag*real, real*imag, imag*real)
199222
register __vector float vtemp0_p = {0.0, 0.0,0.0,0.0};
200223
register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0};
201-
__vector float* va0 = (__vector float*) ap;
224+
__vector float* vptr_a0 = (__vector float*) ap;
202225
__vector float* v_x = (__vector float*) x;
203-
204-
for (i = 0; i < n / 2; i+=2) {
205-
register __vector float vx_0 = v_x[i];
206-
register __vector float vx_1 = v_x[i+1];
226+
BLASLONG i = 0;
227+
BLASLONG i2 = 16;
228+
for (;i< n * 8; i+=32, i2+=32) {
229+
register __vector float vx_0 = vec_vsx_ld( i,v_x) ;
230+
register __vector float vx_1 = vec_vsx_ld(i2, v_x);
231+
207232
register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask);
208233
register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask);
209234

210-
vtemp0_p += vx_0*va0[i] + vx_1*va0[i+1] ;
211-
vtemp0_r += vxr_0*va0[i] + vxr_1*va0[i+1];
235+
register __vector float va0 = vec_vsx_ld(i,vptr_a0);
236+
register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0);
212237

238+
vtemp0_p += vx_0*va0 + vx_1*va0_1 ;
239+
vtemp0_r += vxr_0*va0 + vxr_1*va0_1;
213240
}
214241

215242
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )

0 commit comments

Comments
 (0)