Skip to content

Commit 4c22828

Browse files
caxpy and cdot kernels now load vectors with vec_vsx_ld (caxpy also stores its results with vec_vsx_st)
1 parent e79712d commit 4c22828

File tree

2 files changed: 69 additions and 50 deletions.

kernel/power/caxpy.c

Lines changed: 40 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2727
#include "common.h"
2828
#ifndef HAVE_ASM_KERNEL
2929
#include <altivec.h>
30+
31+
#define offset_0 0
32+
#define offset_1 16
33+
#define offset_2 32
34+
#define offset_3 48
35+
#define offset_4 64
36+
#define offset_5 80
37+
#define offset_6 96
38+
#define offset_7 112
39+
3040
static const unsigned char __attribute__((aligned(16))) swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11};
3141

3242
static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i)
@@ -43,27 +53,28 @@ static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT
4353
#endif
4454

4555
__vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr);
46-
register __vector float *vy = (__vector float *) y;
47-
register __vector float *vx = (__vector float *) x;
56+
register __vector float *vptr_y = (__vector float *) y;
57+
register __vector float *vptr_x = (__vector float *) x;
4858
BLASLONG i=0;
49-
for (; i < n/2; i += 8) {
59+
for(;i<n/2;i+=8){
60+
61+
register __vector float vy_0 = vec_vsx_ld( offset_0 ,vptr_y ) ;
62+
register __vector float vy_1 = vec_vsx_ld( offset_1 ,vptr_y ) ;
63+
register __vector float vy_2 = vec_vsx_ld( offset_2 ,vptr_y ) ;
64+
register __vector float vy_3 = vec_vsx_ld( offset_3 ,vptr_y ) ;
65+
register __vector float vy_4 = vec_vsx_ld( offset_4 ,vptr_y ) ;
66+
register __vector float vy_5 = vec_vsx_ld( offset_5 ,vptr_y ) ;
67+
register __vector float vy_6 = vec_vsx_ld( offset_6 ,vptr_y ) ;
68+
register __vector float vy_7 = vec_vsx_ld( offset_7 ,vptr_y ) ;
5069

51-
register __vector float vy_0 = vy[i];
52-
register __vector float vy_1 = vy[i + 1];
53-
register __vector float vy_2 = vy[i + 2];
54-
register __vector float vy_3 = vy[i + 3];
55-
register __vector float vy_4 = vy[i + 4];
56-
register __vector float vy_5 = vy[i + 5];
57-
register __vector float vy_6 = vy[i + 6];
58-
register __vector float vy_7 = vy[i + 7];
59-
register __vector float vx_0 = vx[i];
60-
register __vector float vx_1 = vx[i + 1];
61-
register __vector float vx_2 = vx[i + 2];
62-
register __vector float vx_3 = vx[i + 3];
63-
register __vector float vx_4 = vx[i + 4];
64-
register __vector float vx_5 = vx[i + 5];
65-
register __vector float vx_6 = vx[i + 6];
66-
register __vector float vx_7 = vx[i + 7];
70+
register __vector float vx_0 = vec_vsx_ld( offset_0 ,vptr_x ) ;
71+
register __vector float vx_1 = vec_vsx_ld( offset_1 ,vptr_x ) ;
72+
register __vector float vx_2 = vec_vsx_ld( offset_2 ,vptr_x ) ;
73+
register __vector float vx_3 = vec_vsx_ld( offset_3 ,vptr_x ) ;
74+
register __vector float vx_4 = vec_vsx_ld( offset_4 ,vptr_x ) ;
75+
register __vector float vx_5 = vec_vsx_ld( offset_5 ,vptr_x ) ;
76+
register __vector float vx_6 = vec_vsx_ld( offset_6 ,vptr_x ) ;
77+
register __vector float vx_7 = vec_vsx_ld( offset_7 ,vptr_x ) ;
6778
vy_0 += vx_0*valpha_r;
6879
vy_1 += vx_1*valpha_r;
6980
vy_2 += vx_2*valpha_r;
@@ -88,15 +99,17 @@ static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT
8899
vy_5 += vx_5*valpha_i;
89100
vy_6 += vx_6*valpha_i;
90101
vy_7 += vx_7*valpha_i;
91-
vy[i] = vy_0;
92-
vy[i + 1] = vy_1;
93-
vy[i + 2] = vy_2;
94-
vy[i + 3] = vy_3;
95-
vy[i + 4] = vy_4;
96-
vy[i + 5] = vy_5 ;
97-
vy[i + 6] = vy_6 ;
98-
vy[i + 7] = vy_7 ;
102+
vec_vsx_st( vy_0, offset_0 ,vptr_y ) ;
103+
vec_vsx_st( vy_1, offset_1 ,vptr_y ) ;
104+
vec_vsx_st( vy_2, offset_2 ,vptr_y ) ;
105+
vec_vsx_st( vy_3, offset_3 ,vptr_y ) ;
106+
vec_vsx_st( vy_4, offset_4 ,vptr_y ) ;
107+
vec_vsx_st( vy_5, offset_5 ,vptr_y ) ;
108+
vec_vsx_st( vy_6, offset_6 ,vptr_y ) ;
109+
vec_vsx_st( vy_7, offset_7 ,vptr_y ) ;
99110

111+
vptr_x+=8;
112+
vptr_y+=8;
100113
}
101114
}
102115
#endif

kernel/power/cdot.c

Lines changed: 29 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -27,40 +27,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2727
#include "common.h"
2828
#ifndef HAVE_KERNEL_8
2929
#include <altivec.h>
30+
31+
#define offset_0 0
32+
#define offset_1 16
33+
#define offset_2 32
34+
#define offset_3 48
35+
36+
37+
3038
static const unsigned char __attribute__((aligned(16))) swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11};
3139
static void cdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, float *dot)
3240
{
3341
__vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr);
34-
register __vector float *vy = (__vector float *) y;
35-
register __vector float *vx = (__vector float *) x;
36-
BLASLONG i = 0;
42+
register __vector float *vptr_y = (__vector float *) y;
43+
register __vector float *vptr_x = (__vector float *) x;
3744
register __vector float vd_0 = { 0 };
3845
register __vector float vd_1 = { 0 };
3946
register __vector float vd_2 = { 0 };
4047
register __vector float vd_3 = { 0 };
4148
register __vector float vdd_0 = { 0 };
4249
register __vector float vdd_1 = { 0 };
4350
register __vector float vdd_2 = { 0 };
44-
register __vector float vdd_3 = { 0 };
45-
for (; i < n/2; i += 4) {
46-
47-
register __vector float vyy_0 ;
48-
register __vector float vyy_1 ;
49-
register __vector float vyy_2 ;
50-
register __vector float vyy_3 ;
51-
52-
register __vector float vy_0 = vy[i];
53-
register __vector float vy_1 = vy[i + 1];
54-
register __vector float vy_2 = vy[i + 2];
55-
register __vector float vy_3 = vy[i + 3];
56-
register __vector float vx_0= vx[i];
57-
register __vector float vx_1 = vx[i + 1];
58-
register __vector float vx_2 = vx[i + 2];
59-
register __vector float vx_3 = vx[i + 3];
60-
vyy_0 = vec_perm(vy_0, vy_0, swap_mask);
61-
vyy_1 = vec_perm(vy_1, vy_1, swap_mask);
62-
vyy_2 = vec_perm(vy_2, vy_2, swap_mask);
63-
vyy_3 = vec_perm(vy_3, vy_3, swap_mask);
51+
register __vector float vdd_3 = { 0 };
52+
BLASLONG i=0;
53+
for(;i<n/2;i+=4){
54+
55+
register __vector float vy_0 = vec_vsx_ld( offset_0 ,vptr_y ) ;
56+
register __vector float vy_1 = vec_vsx_ld( offset_1 ,vptr_y ) ;
57+
register __vector float vy_2 = vec_vsx_ld( offset_2 ,vptr_y ) ;
58+
register __vector float vy_3 = vec_vsx_ld( offset_3 ,vptr_y ) ;
59+
60+
register __vector float vx_0 = vec_vsx_ld( offset_0 ,vptr_x ) ;
61+
register __vector float vx_1 = vec_vsx_ld( offset_1 ,vptr_x ) ;
62+
register __vector float vyy_0 = vec_perm(vy_0, vy_0, swap_mask);
63+
register __vector float vyy_1 = vec_perm(vy_1, vy_1, swap_mask);
64+
register __vector float vx_2 = vec_vsx_ld( offset_2 ,vptr_x ) ;
65+
register __vector float vx_3 = vec_vsx_ld( offset_3 ,vptr_x ) ;
66+
register __vector float vyy_2 = vec_perm(vy_2, vy_2, swap_mask);
67+
register __vector float vyy_3 = vec_perm(vy_3, vy_3, swap_mask);
6468

6569
vd_0 += vx_0 * vy_0;
6670
vd_1 += vx_1 * vy_1;
@@ -72,6 +76,8 @@ static void cdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, float *dot)
7276
vdd_2 += vx_2 * vyy_2;
7377
vdd_3 += vx_3 * vyy_3;
7478

79+
vptr_x+=4;
80+
vptr_y+=4;
7581

7682
}
7783
//aggregate

0 commit comments

Comments
 (0)