Skip to content

Commit f3c3145

Browse files
authored
Merge pull request #2243 from quickwritereader/develop
possible cgemv,caxpy,cdot fix
2 parents: ec1ef6a + 847c20c; commit: f3c3145

File tree

12 files changed: 217 additions (+217) and 162 deletions (-162)

kernel/power/caxpy.c

Lines changed: 43 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -24,12 +24,21 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
2424
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
2525
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2626
*****************************************************************************/
27-
2827
#include "common.h"
29-
30-
3128
#ifndef HAVE_ASM_KERNEL
3229
#include <altivec.h>
30+
31+
#define offset_0 0
32+
#define offset_1 16
33+
#define offset_2 32
34+
#define offset_3 48
35+
#define offset_4 64
36+
#define offset_5 80
37+
#define offset_6 96
38+
#define offset_7 112
39+
40+
static const unsigned char __attribute__((aligned(16))) swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11};
41+
3342
static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i)
3443
{
3544

@@ -43,28 +52,29 @@ static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT
4352
register __vector float valpha_i = {alpha_i, alpha_i,alpha_i, alpha_i};
4453
#endif
4554

46-
__vector unsigned char swap_mask = { 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11};
47-
register __vector float *vy = (__vector float *) y;
48-
register __vector float *vx = (__vector float *) x;
55+
__vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr);
56+
register __vector float *vptr_y = (__vector float *) y;
57+
register __vector float *vptr_x = (__vector float *) x;
4958
BLASLONG i=0;
50-
for (; i < n/2; i += 8) {
59+
for(;i<n/2;i+=8){
60+
61+
register __vector float vy_0 = vec_vsx_ld( offset_0 ,vptr_y ) ;
62+
register __vector float vy_1 = vec_vsx_ld( offset_1 ,vptr_y ) ;
63+
register __vector float vy_2 = vec_vsx_ld( offset_2 ,vptr_y ) ;
64+
register __vector float vy_3 = vec_vsx_ld( offset_3 ,vptr_y ) ;
65+
register __vector float vy_4 = vec_vsx_ld( offset_4 ,vptr_y ) ;
66+
register __vector float vy_5 = vec_vsx_ld( offset_5 ,vptr_y ) ;
67+
register __vector float vy_6 = vec_vsx_ld( offset_6 ,vptr_y ) ;
68+
register __vector float vy_7 = vec_vsx_ld( offset_7 ,vptr_y ) ;
5169

52-
register __vector float vy_0 = vy[i];
53-
register __vector float vy_1 = vy[i + 1];
54-
register __vector float vy_2 = vy[i + 2];
55-
register __vector float vy_3 = vy[i + 3];
56-
register __vector float vy_4 = vy[i + 4];
57-
register __vector float vy_5 = vy[i + 5];
58-
register __vector float vy_6 = vy[i + 6];
59-
register __vector float vy_7 = vy[i + 7];
60-
register __vector float vx_0 = vx[i];
61-
register __vector float vx_1 = vx[i + 1];
62-
register __vector float vx_2 = vx[i + 2];
63-
register __vector float vx_3 = vx[i + 3];
64-
register __vector float vx_4 = vx[i + 4];
65-
register __vector float vx_5 = vx[i + 5];
66-
register __vector float vx_6 = vx[i + 6];
67-
register __vector float vx_7 = vx[i + 7];
70+
register __vector float vx_0 = vec_vsx_ld( offset_0 ,vptr_x ) ;
71+
register __vector float vx_1 = vec_vsx_ld( offset_1 ,vptr_x ) ;
72+
register __vector float vx_2 = vec_vsx_ld( offset_2 ,vptr_x ) ;
73+
register __vector float vx_3 = vec_vsx_ld( offset_3 ,vptr_x ) ;
74+
register __vector float vx_4 = vec_vsx_ld( offset_4 ,vptr_x ) ;
75+
register __vector float vx_5 = vec_vsx_ld( offset_5 ,vptr_x ) ;
76+
register __vector float vx_6 = vec_vsx_ld( offset_6 ,vptr_x ) ;
77+
register __vector float vx_7 = vec_vsx_ld( offset_7 ,vptr_x ) ;
6878
vy_0 += vx_0*valpha_r;
6979
vy_1 += vx_1*valpha_r;
7080
vy_2 += vx_2*valpha_r;
@@ -89,15 +99,17 @@ static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT
8999
vy_5 += vx_5*valpha_i;
90100
vy_6 += vx_6*valpha_i;
91101
vy_7 += vx_7*valpha_i;
92-
vy[i] = vy_0;
93-
vy[i + 1] = vy_1;
94-
vy[i + 2] = vy_2;
95-
vy[i + 3] = vy_3;
96-
vy[i + 4] = vy_4;
97-
vy[i + 5] = vy_5 ;
98-
vy[i + 6] = vy_6 ;
99-
vy[i + 7] = vy_7 ;
102+
vec_vsx_st( vy_0, offset_0 ,vptr_y ) ;
103+
vec_vsx_st( vy_1, offset_1 ,vptr_y ) ;
104+
vec_vsx_st( vy_2, offset_2 ,vptr_y ) ;
105+
vec_vsx_st( vy_3, offset_3 ,vptr_y ) ;
106+
vec_vsx_st( vy_4, offset_4 ,vptr_y ) ;
107+
vec_vsx_st( vy_5, offset_5 ,vptr_y ) ;
108+
vec_vsx_st( vy_6, offset_6 ,vptr_y ) ;
109+
vec_vsx_st( vy_7, offset_7 ,vptr_y ) ;
100110

111+
vptr_x+=8;
112+
vptr_y+=8;
101113
}
102114
}
103115
#endif

kernel/power/cdot.c

Lines changed: 32 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -25,42 +25,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2525
*****************************************************************************/
2626

2727
#include "common.h"
28-
2928
#ifndef HAVE_KERNEL_8
3029
#include <altivec.h>
30+
31+
#define offset_0 0
32+
#define offset_1 16
33+
#define offset_2 32
34+
#define offset_3 48
35+
36+
37+
38+
static const unsigned char __attribute__((aligned(16))) swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11};
3139
static void cdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, float *dot)
3240
{
33-
__vector unsigned char swap_mask = { 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11};
34-
register __vector float *vy = (__vector float *) y;
35-
register __vector float *vx = (__vector float *) x;
36-
BLASLONG i = 0;
41+
__vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr);
42+
register __vector float *vptr_y = (__vector float *) y;
43+
register __vector float *vptr_x = (__vector float *) x;
3744
register __vector float vd_0 = { 0 };
3845
register __vector float vd_1 = { 0 };
3946
register __vector float vd_2 = { 0 };
4047
register __vector float vd_3 = { 0 };
4148
register __vector float vdd_0 = { 0 };
4249
register __vector float vdd_1 = { 0 };
4350
register __vector float vdd_2 = { 0 };
44-
register __vector float vdd_3 = { 0 };
45-
for (; i < n/2; i += 4) {
46-
47-
register __vector float vyy_0 ;
48-
register __vector float vyy_1 ;
49-
register __vector float vyy_2 ;
50-
register __vector float vyy_3 ;
51-
52-
register __vector float vy_0 = vy[i];
53-
register __vector float vy_1 = vy[i + 1];
54-
register __vector float vy_2 = vy[i + 2];
55-
register __vector float vy_3 = vy[i + 3];
56-
register __vector float vx_0= vx[i];
57-
register __vector float vx_1 = vx[i + 1];
58-
register __vector float vx_2 = vx[i + 2];
59-
register __vector float vx_3 = vx[i + 3];
60-
vyy_0 = vec_perm(vy_0, vy_0, swap_mask);
61-
vyy_1 = vec_perm(vy_1, vy_1, swap_mask);
62-
vyy_2 = vec_perm(vy_2, vy_2, swap_mask);
63-
vyy_3 = vec_perm(vy_3, vy_3, swap_mask);
51+
register __vector float vdd_3 = { 0 };
52+
BLASLONG i=0;
53+
for(;i<n/2;i+=4){
54+
55+
register __vector float vy_0 = vec_vsx_ld( offset_0 ,vptr_y ) ;
56+
register __vector float vy_1 = vec_vsx_ld( offset_1 ,vptr_y ) ;
57+
register __vector float vy_2 = vec_vsx_ld( offset_2 ,vptr_y ) ;
58+
register __vector float vy_3 = vec_vsx_ld( offset_3 ,vptr_y ) ;
59+
60+
register __vector float vx_0 = vec_vsx_ld( offset_0 ,vptr_x ) ;
61+
register __vector float vx_1 = vec_vsx_ld( offset_1 ,vptr_x ) ;
62+
register __vector float vyy_0 = vec_perm(vy_0, vy_0, swap_mask);
63+
register __vector float vyy_1 = vec_perm(vy_1, vy_1, swap_mask);
64+
register __vector float vx_2 = vec_vsx_ld( offset_2 ,vptr_x ) ;
65+
register __vector float vx_3 = vec_vsx_ld( offset_3 ,vptr_x ) ;
66+
register __vector float vyy_2 = vec_perm(vy_2, vy_2, swap_mask);
67+
register __vector float vyy_3 = vec_perm(vy_3, vy_3, swap_mask);
6468

6569
vd_0 += vx_0 * vy_0;
6670
vd_1 += vx_1 * vy_1;
@@ -72,6 +76,8 @@ static void cdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, float *dot)
7276
vdd_2 += vx_2 * vyy_2;
7377
vdd_3 += vx_3 * vyy_3;
7478

79+
vptr_x+=4;
80+
vptr_y+=4;
7581

7682
}
7783
//aggregate
@@ -96,7 +102,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
96102
BLASLONG i = 0;
97103
BLASLONG ix=0, iy=0;
98104
OPENBLAS_COMPLEX_FLOAT result;
99-
FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0};
105+
FLOAT dot[4] __attribute__((aligned(16))) = {0.0, 0.0, 0.0, 0.0};
100106

101107
if (n <= 0) {
102108
CREAL(result) = 0.0;

kernel/power/cgemv_n.c

Lines changed: 58 additions & 49 deletions
Original file line number | Diff line number | Diff line change
@@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3232
#define NBMAX 1024
3333

3434

35-
static const unsigned char swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11};
35+
static const unsigned char __attribute__((aligned(16))) swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11};
3636

3737

3838
static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) {
@@ -62,23 +62,24 @@ static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA
6262
register __vector float vx3_r = {x[6], -x[6],x[6], -x[6]};
6363
register __vector float vx3_i = {x[7], x[7],x[7], x[7]};
6464
#endif
65-
register __vector float *vy = (__vector float *) y;
65+
register __vector float *vptr_y = (__vector float *) y;
6666
register __vector float *vptr_a0 = (__vector float *) a0;
6767
register __vector float *vptr_a1 = (__vector float *) a1;
6868
register __vector float *vptr_a2 = (__vector float *) a2;
6969
register __vector float *vptr_a3 = (__vector float *) a3;
7070
BLASLONG i = 0;
71-
for (;i< n / 2; i+=2) {
72-
register __vector float vy_0 = vy[i];
73-
register __vector float vy_1 = vy[i + 1];
74-
register __vector float va0 = vptr_a0[i];
75-
register __vector float va1 = vptr_a1[i];
76-
register __vector float va2 = vptr_a2[i];
77-
register __vector float va3 = vptr_a3[i];
78-
register __vector float va0_1 = vptr_a0[i + 1];
79-
register __vector float va1_1 = vptr_a1[i + 1];
80-
register __vector float va2_1 = vptr_a2[i + 1];
81-
register __vector float va3_1 = vptr_a3[i + 1];
71+
BLASLONG i2=16;
72+
for (;i< n * 8; i+=32,i2+=32) {
73+
register __vector float vy_0 = vec_vsx_ld(i,vptr_y);
74+
register __vector float vy_1 = vec_vsx_ld(i2,vptr_y);
75+
register __vector float va0 = vec_vsx_ld(i,vptr_a0);
76+
register __vector float va1 = vec_vsx_ld(i, vptr_a1);
77+
register __vector float va2 = vec_vsx_ld(i ,vptr_a2);
78+
register __vector float va3 = vec_vsx_ld(i ,vptr_a3);
79+
register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0);
80+
register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1);
81+
register __vector float va2_1 = vec_vsx_ld(i2 ,vptr_a2);
82+
register __vector float va3_1 = vec_vsx_ld(i2 ,vptr_a3);
8283

8384
vy_0 += va0*vx0_r + va1*vx1_r + va2*vx2_r + va3*vx3_r;
8485
vy_1 += va0_1*vx0_r + va1_1*vx1_r + va2_1*vx2_r + va3_1*vx3_r;
@@ -93,8 +94,8 @@ static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA
9394
vy_0 += va0*vx0_i + va1*vx1_i + va2*vx2_i + va3*vx3_i;
9495
vy_1 += va0_1*vx0_i + va1_1*vx1_i + va2_1*vx2_i + va3_1*vx3_i;
9596

96-
vy[i] = vy_0;
97-
vy[i + 1] = vy_1;
97+
vec_vsx_st(vy_0 ,i, vptr_y);
98+
vec_vsx_st(vy_1,i2,vptr_y);
9899
}
99100

100101
}
@@ -118,26 +119,28 @@ static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA
118119
register __vector float vx1_r = {x[2], -x[2],x[2], -x[2]};
119120
register __vector float vx1_i = {x[3], x[3],x[3], x[3]};
120121
#endif
121-
register __vector float *vy = (__vector float *) y;
122+
register __vector float *vptr_y = (__vector float *) y;
122123
register __vector float *vptr_a0 = (__vector float *) a0;
123124
register __vector float *vptr_a1 = (__vector float *) a1;
124-
BLASLONG i = 0;
125-
for (;i< n / 2; i+=2) {
126-
register __vector float vy_0 = vy[i];
127-
register __vector float vy_1 = vy[i + 1];
128-
register __vector float va0 = vptr_a0[i];
129-
register __vector float va1 = vptr_a1[i];
130-
register __vector float va0_1 = vptr_a0[i + 1];
131-
register __vector float va1_1 = vptr_a1[i + 1];
125+
BLASLONG i = 0;
126+
BLASLONG i2 = 16;
127+
for (;i< n * 8; i+=32, i2+=32) {
128+
register __vector float vy_0 = vec_vsx_ld(i,vptr_y);
129+
register __vector float vy_1 = vec_vsx_ld(i2,vptr_y);
130+
register __vector float va0 = vec_vsx_ld(i,vptr_a0);
131+
register __vector float va1 = vec_vsx_ld(i, vptr_a1);
132+
register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0);
133+
register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1);
134+
132135
register __vector float va0x = vec_perm(va0, va0,swap_mask);
133136
register __vector float va0x_1 = vec_perm(va0_1, va0_1,swap_mask);
134137
register __vector float va1x = vec_perm(va1, va1,swap_mask);
135138
register __vector float va1x_1 = vec_perm(va1_1, va1_1,swap_mask);
136139
vy_0 += va0*vx0_r + va1*vx1_r + va0x*vx0_i + va1x*vx1_i;
137140
vy_1 += va0_1*vx0_r + va1_1*vx1_r + va0x_1*vx0_i + va1x_1*vx1_i;
138141

139-
vy[i] = vy_0;
140-
vy[i + 1] = vy_1;
142+
vec_vsx_st(vy_0 ,i, vptr_y);
143+
vec_vsx_st(vy_1,i2,vptr_y);
141144
}
142145

143146
}
@@ -154,29 +157,31 @@ static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) {
154157
register __vector float vx0_r = {x[0], -x[0],x[0], -x[0]};
155158
register __vector float vx0_i = {x[1], x[1],x[1], x[1]};
156159
#endif
157-
register __vector float *vy = (__vector float *) y;
160+
register __vector float *vptr_y = (__vector float *) y;
158161
register __vector float *vptr_a0 = (__vector float *) ap;
159162
BLASLONG i = 0;
160-
for (;i< n / 2; i+=2) {
161-
register __vector float vy_0 = vy[i];
162-
register __vector float vy_1 = vy[i + 1];
163-
register __vector float va0 = vptr_a0[i];
164-
register __vector float va0_1 = vptr_a0[i + 1];
163+
BLASLONG i2 = 16;
164+
for (;i< n * 8; i+=32, i2+=32) {
165+
register __vector float vy_0 = vec_vsx_ld(i,vptr_y);
166+
register __vector float vy_1 = vec_vsx_ld(i2,vptr_y);
167+
register __vector float va0 = vec_vsx_ld(i,vptr_a0);
168+
register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0);
169+
165170
register __vector float va0x = vec_perm(va0, va0,swap_mask);
166171
register __vector float va0x_1 = vec_perm(va0_1, va0_1,swap_mask);
167172
vy_0 += va0*vx0_r + va0x*vx0_i;
168173
vy_1 += va0_1*vx0_r + va0x_1*vx0_i;
169174

170-
vy[i] = vy_0;
171-
vy[i + 1] = vy_1;
175+
vec_vsx_st(vy_0 ,i, vptr_y);
176+
vec_vsx_st(vy_1,i2,vptr_y);
172177
}
173178
}
174179

175180

176181

177182

178183
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) {
179-
BLASLONG i;
184+
BLASLONG i=0;
180185

181186

182187
if (inc_dest != 2) {
@@ -213,20 +218,24 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT
213218

214219
register __vector float *vptr_src = (__vector float *) src;
215220
register __vector float *vptr_y = (__vector float *) dest;
216-
for (i = 0; i < n/2; i += 2 ){
217221

218-
register __vector float vy_0 = vptr_y[i];
219-
register __vector float vy_1 = vptr_y[i +1];
222+
BLASLONG i2 = 16;
223+
for (;i< n * 8; i+=32, i2+=32) {
224+
register __vector float vy_0 = vec_vsx_ld(i,vptr_y);
225+
register __vector float vy_1 = vec_vsx_ld(i2,vptr_y);
226+
220227

221-
register __vector float vsrc = vptr_src[i];
222-
register __vector float vsrc_1 = vptr_src[i + 1];
223-
register __vector float vsrcx = vec_perm(vsrc, vsrc, swap_mask);
224-
register __vector float vsrcx_1 = vec_perm(vsrc_1, vsrc_1, swap_mask);
228+
register __vector float vsrc = vec_vsx_ld(i,vptr_src);
229+
register __vector float vsrc_1 = vec_vsx_ld(i2,vptr_src);
225230

226-
vy_0 += vsrc*valpha_r + vsrcx*valpha_i;
227-
vy_1 += vsrc_1*valpha_r + vsrcx_1*valpha_i;
228-
vptr_y[i] = vy_0;
229-
vptr_y[i+1 ] = vy_1;
231+
register __vector float vsrcx = vec_perm(vsrc, vsrc, swap_mask);
232+
register __vector float vsrcx_1 = vec_perm(vsrc_1, vsrc_1, swap_mask);
233+
234+
vy_0 += vsrc*valpha_r + vsrcx*valpha_i;
235+
vy_1 += vsrc_1*valpha_r + vsrcx_1*valpha_i;
236+
237+
vec_vsx_st(vy_0 ,i, vptr_y);
238+
vec_vsx_st(vy_1,i2,vptr_y);
230239

231240
}
232241

@@ -237,7 +246,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT
237246

238247

239248
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT * buffer) {
240-
BLASLONG i;
249+
BLASLONG i=0;
241250
FLOAT *a_ptr;
242251
FLOAT *x_ptr;
243252
FLOAT *y_ptr;
@@ -247,8 +256,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
247256
BLASLONG m2;
248257
BLASLONG m3;
249258
BLASLONG n2;
250-
251-
FLOAT xbuffer[8], *ybuffer;
259+
FLOAT xbuffer[8] __attribute__((aligned(16)));
260+
FLOAT *ybuffer;
252261

253262
if (m < 1) return (0);
254263
if (n < 1) return (0);

0 commit comments

Comments
 (0)