Skip to content

Commit 256fc15

Browse files
authored
Merge pull request #2 from xianyi/develop
update
2 parents 300f158 + ee49852 commit 256fc15

File tree

16 files changed

+269
-176
lines changed

16 files changed

+269
-176
lines changed

.travis.yml

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -25,14 +25,14 @@ matrix:
2525
- TARGET_BOX=LINUX64
2626
- BTYPE="BINARY=64"
2727

28-
- <<: *test-ubuntu
29-
os: linux-ppc64le
30-
before_script:
31-
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32"
32-
env:
33-
# for matrix annotation only
34-
- TARGET_BOX=PPC64LE_LINUX
35-
- BTYPE="BINARY=64 USE_OPENMP=1"
28+
# - <<: *test-ubuntu
29+
# os: linux-ppc64le
30+
# before_script:
31+
# - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32"
32+
# env:
33+
# # for matrix annotation only
34+
# - TARGET_BOX=PPC64LE_LINUX
35+
# - BTYPE="BINARY=64 USE_OPENMP=1"
3636

3737
- <<: *test-ubuntu
3838
env:

Makefile.system

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -699,7 +699,7 @@ endif
699699

700700
ifeq ($(C_COMPILER), PGI)
701701
ifdef BINARY64
702-
CCOMMON_OPT += -tp p7-64
702+
CCOMMON_OPT += -tp p7-64 -D__MMX__ -Mnollvm
703703
else
704704
CCOMMON_OPT += -tp p7
705705
endif

cmake/prebuild.cmake

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,29 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
106106
file(APPEND ${TARGET_CONF_TEMP}
107107
"#define ${TCORE}\n"
108108
"#define CHAR_CORENAME \"${TCORE}\"\n")
109-
if ("${TCORE}" STREQUAL "ARMV7")
109+
if ("${TCORE}" STREQUAL "CORE2")
110+
file(APPEND ${TARGET_CONF_TEMP}
111+
"#define L1_DATA_SIZE\t32768\n"
112+
"#define L1_DATA_LINESIZE\t64\n"
113+
"#define L2_SIZE\t1048576\n"
114+
"#define L2_LINESIZE\t64\n"
115+
"#define DTB_DEFAULT_ENTRIES\t256\n"
116+
"#define DTB_SIZE\t4096\n"
117+
"#define HAVE_CMOV\n"
118+
"#define HAVE_MMX\n"
119+
"#define HAVE_SSE\n"
120+
"#define HAVE_SSE2\n"
121+
"#define HAVE_SSE3\n"
122+
"#define HAVE_SSSE3\n")
123+
set(SGEMM_UNROLL_M 8)
124+
set(SGEMM_UNROLL_N 4)
125+
set(DGEMM_UNROLL_M 4)
126+
set(DGEMM_UNROLL_N 4)
127+
set(CGEMM_DEFAULT_UNROLL_M 4)
128+
set(CGEMM_DEFAULT_UNROLL_N 2)
129+
set(ZGEMM_DEFAULT_UNROLL_M 2)
130+
set(ZGEMM_DEFAULT_UNROLL_N 2)
131+
elseif ("${TCORE}" STREQUAL "ARMV7")
110132
file(APPEND ${TARGET_CONF_TEMP}
111133
"#define L1_DATA_SIZE\t65536\n"
112134
"#define L1_DATA_LINESIZE\t32\n"

kernel/power/caxpy.c

Lines changed: 43 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,21 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
2424
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
2525
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2626
*****************************************************************************/
27-
2827
#include "common.h"
29-
30-
3128
#ifndef HAVE_ASM_KERNEL
3229
#include <altivec.h>
30+
31+
#define offset_0 0
32+
#define offset_1 16
33+
#define offset_2 32
34+
#define offset_3 48
35+
#define offset_4 64
36+
#define offset_5 80
37+
#define offset_6 96
38+
#define offset_7 112
39+
40+
static const unsigned char __attribute__((aligned(16))) swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11};
41+
3342
static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i)
3443
{
3544

@@ -43,28 +52,29 @@ static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT
4352
register __vector float valpha_i = {alpha_i, alpha_i,alpha_i, alpha_i};
4453
#endif
4554

46-
__vector unsigned char swap_mask = { 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11};
47-
register __vector float *vy = (__vector float *) y;
48-
register __vector float *vx = (__vector float *) x;
55+
__vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr);
56+
register __vector float *vptr_y = (__vector float *) y;
57+
register __vector float *vptr_x = (__vector float *) x;
4958
BLASLONG i=0;
50-
for (; i < n/2; i += 8) {
59+
for(;i<n/2;i+=8){
60+
61+
register __vector float vy_0 = vec_vsx_ld( offset_0 ,vptr_y ) ;
62+
register __vector float vy_1 = vec_vsx_ld( offset_1 ,vptr_y ) ;
63+
register __vector float vy_2 = vec_vsx_ld( offset_2 ,vptr_y ) ;
64+
register __vector float vy_3 = vec_vsx_ld( offset_3 ,vptr_y ) ;
65+
register __vector float vy_4 = vec_vsx_ld( offset_4 ,vptr_y ) ;
66+
register __vector float vy_5 = vec_vsx_ld( offset_5 ,vptr_y ) ;
67+
register __vector float vy_6 = vec_vsx_ld( offset_6 ,vptr_y ) ;
68+
register __vector float vy_7 = vec_vsx_ld( offset_7 ,vptr_y ) ;
5169

52-
register __vector float vy_0 = vy[i];
53-
register __vector float vy_1 = vy[i + 1];
54-
register __vector float vy_2 = vy[i + 2];
55-
register __vector float vy_3 = vy[i + 3];
56-
register __vector float vy_4 = vy[i + 4];
57-
register __vector float vy_5 = vy[i + 5];
58-
register __vector float vy_6 = vy[i + 6];
59-
register __vector float vy_7 = vy[i + 7];
60-
register __vector float vx_0 = vx[i];
61-
register __vector float vx_1 = vx[i + 1];
62-
register __vector float vx_2 = vx[i + 2];
63-
register __vector float vx_3 = vx[i + 3];
64-
register __vector float vx_4 = vx[i + 4];
65-
register __vector float vx_5 = vx[i + 5];
66-
register __vector float vx_6 = vx[i + 6];
67-
register __vector float vx_7 = vx[i + 7];
70+
register __vector float vx_0 = vec_vsx_ld( offset_0 ,vptr_x ) ;
71+
register __vector float vx_1 = vec_vsx_ld( offset_1 ,vptr_x ) ;
72+
register __vector float vx_2 = vec_vsx_ld( offset_2 ,vptr_x ) ;
73+
register __vector float vx_3 = vec_vsx_ld( offset_3 ,vptr_x ) ;
74+
register __vector float vx_4 = vec_vsx_ld( offset_4 ,vptr_x ) ;
75+
register __vector float vx_5 = vec_vsx_ld( offset_5 ,vptr_x ) ;
76+
register __vector float vx_6 = vec_vsx_ld( offset_6 ,vptr_x ) ;
77+
register __vector float vx_7 = vec_vsx_ld( offset_7 ,vptr_x ) ;
6878
vy_0 += vx_0*valpha_r;
6979
vy_1 += vx_1*valpha_r;
7080
vy_2 += vx_2*valpha_r;
@@ -89,15 +99,17 @@ static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT
8999
vy_5 += vx_5*valpha_i;
90100
vy_6 += vx_6*valpha_i;
91101
vy_7 += vx_7*valpha_i;
92-
vy[i] = vy_0;
93-
vy[i + 1] = vy_1;
94-
vy[i + 2] = vy_2;
95-
vy[i + 3] = vy_3;
96-
vy[i + 4] = vy_4;
97-
vy[i + 5] = vy_5 ;
98-
vy[i + 6] = vy_6 ;
99-
vy[i + 7] = vy_7 ;
102+
vec_vsx_st( vy_0, offset_0 ,vptr_y ) ;
103+
vec_vsx_st( vy_1, offset_1 ,vptr_y ) ;
104+
vec_vsx_st( vy_2, offset_2 ,vptr_y ) ;
105+
vec_vsx_st( vy_3, offset_3 ,vptr_y ) ;
106+
vec_vsx_st( vy_4, offset_4 ,vptr_y ) ;
107+
vec_vsx_st( vy_5, offset_5 ,vptr_y ) ;
108+
vec_vsx_st( vy_6, offset_6 ,vptr_y ) ;
109+
vec_vsx_st( vy_7, offset_7 ,vptr_y ) ;
100110

111+
vptr_x+=8;
112+
vptr_y+=8;
101113
}
102114
}
103115
#endif

kernel/power/cdot.c

Lines changed: 32 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -25,42 +25,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2525
*****************************************************************************/
2626

2727
#include "common.h"
28-
2928
#ifndef HAVE_KERNEL_8
3029
#include <altivec.h>
30+
31+
#define offset_0 0
32+
#define offset_1 16
33+
#define offset_2 32
34+
#define offset_3 48
35+
36+
37+
38+
static const unsigned char __attribute__((aligned(16))) swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11};
3139
static void cdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, float *dot)
3240
{
33-
__vector unsigned char swap_mask = { 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11};
34-
register __vector float *vy = (__vector float *) y;
35-
register __vector float *vx = (__vector float *) x;
36-
BLASLONG i = 0;
41+
__vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr);
42+
register __vector float *vptr_y = (__vector float *) y;
43+
register __vector float *vptr_x = (__vector float *) x;
3744
register __vector float vd_0 = { 0 };
3845
register __vector float vd_1 = { 0 };
3946
register __vector float vd_2 = { 0 };
4047
register __vector float vd_3 = { 0 };
4148
register __vector float vdd_0 = { 0 };
4249
register __vector float vdd_1 = { 0 };
4350
register __vector float vdd_2 = { 0 };
44-
register __vector float vdd_3 = { 0 };
45-
for (; i < n/2; i += 4) {
46-
47-
register __vector float vyy_0 ;
48-
register __vector float vyy_1 ;
49-
register __vector float vyy_2 ;
50-
register __vector float vyy_3 ;
51-
52-
register __vector float vy_0 = vy[i];
53-
register __vector float vy_1 = vy[i + 1];
54-
register __vector float vy_2 = vy[i + 2];
55-
register __vector float vy_3 = vy[i + 3];
56-
register __vector float vx_0= vx[i];
57-
register __vector float vx_1 = vx[i + 1];
58-
register __vector float vx_2 = vx[i + 2];
59-
register __vector float vx_3 = vx[i + 3];
60-
vyy_0 = vec_perm(vy_0, vy_0, swap_mask);
61-
vyy_1 = vec_perm(vy_1, vy_1, swap_mask);
62-
vyy_2 = vec_perm(vy_2, vy_2, swap_mask);
63-
vyy_3 = vec_perm(vy_3, vy_3, swap_mask);
51+
register __vector float vdd_3 = { 0 };
52+
BLASLONG i=0;
53+
for(;i<n/2;i+=4){
54+
55+
register __vector float vy_0 = vec_vsx_ld( offset_0 ,vptr_y ) ;
56+
register __vector float vy_1 = vec_vsx_ld( offset_1 ,vptr_y ) ;
57+
register __vector float vy_2 = vec_vsx_ld( offset_2 ,vptr_y ) ;
58+
register __vector float vy_3 = vec_vsx_ld( offset_3 ,vptr_y ) ;
59+
60+
register __vector float vx_0 = vec_vsx_ld( offset_0 ,vptr_x ) ;
61+
register __vector float vx_1 = vec_vsx_ld( offset_1 ,vptr_x ) ;
62+
register __vector float vyy_0 = vec_perm(vy_0, vy_0, swap_mask);
63+
register __vector float vyy_1 = vec_perm(vy_1, vy_1, swap_mask);
64+
register __vector float vx_2 = vec_vsx_ld( offset_2 ,vptr_x ) ;
65+
register __vector float vx_3 = vec_vsx_ld( offset_3 ,vptr_x ) ;
66+
register __vector float vyy_2 = vec_perm(vy_2, vy_2, swap_mask);
67+
register __vector float vyy_3 = vec_perm(vy_3, vy_3, swap_mask);
6468

6569
vd_0 += vx_0 * vy_0;
6670
vd_1 += vx_1 * vy_1;
@@ -72,6 +76,8 @@ static void cdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, float *dot)
7276
vdd_2 += vx_2 * vyy_2;
7377
vdd_3 += vx_3 * vyy_3;
7478

79+
vptr_x+=4;
80+
vptr_y+=4;
7581

7682
}
7783
//aggregate
@@ -96,7 +102,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
96102
BLASLONG i = 0;
97103
BLASLONG ix=0, iy=0;
98104
OPENBLAS_COMPLEX_FLOAT result;
99-
FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0};
105+
FLOAT dot[4] __attribute__((aligned(16))) = {0.0, 0.0, 0.0, 0.0};
100106

101107
if (n <= 0) {
102108
CREAL(result) = 0.0;

0 commit comments

Comments
 (0)