Skip to content

Commit 9efc3f0

Browse files
authored
Merge pull request #109 from xianyi/develop
rebase
2 parents 680f744 + aa21cb5 commit 9efc3f0

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

95 files changed

+6341
-4145
lines changed

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
66
project(OpenBLAS C ASM)
77
set(OpenBLAS_MAJOR_VERSION 0)
88
set(OpenBLAS_MINOR_VERSION 3)
9-
set(OpenBLAS_PATCH_VERSION 11.dev)
9+
set(OpenBLAS_PATCH_VERSION 12.dev)
1010
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
1111

1212
# Adhere to GNU filesystem layout conventions

Changelog.txt

Lines changed: 29 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,36 @@
11
OpenBLAS ChangeLog
2+
====================================================================
3+
Version 0.3.12
4+
24-Oct-2020
5+
6+
common:
7+
* Fixed missing BLAS/LAPACK functions (inadvertently dropped during
8+
the build system restructuring)
9+
* Fixed argument conversion macro in LAPACKE_zgesvdq (LAPACK #458)
10+
11+
POWER:
12+
* Added optimized SCOPY/CCOPY kernels for POWER10
13+
* Increased and unified the default size of the GEMM BUFFER
14+
* Fixed building for POWER10 in DYNAMIC_ARCH mode
15+
* POWER10 compatibility test now checks binutils version as well
16+
* Cleaned up compiler warnings
17+
18+
x86_64:
19+
* Corrected compiler version checks for AVX2 compatibility
20+
* Added compiler option -mavx2 for building with flang
21+
* Fixed direct SGEMM pathway for small matrix sizes (broken by
22+
the code refactoring in 0.3.11)
23+
* Fixed unhandled partial register clobbers in several kernels
24+
for AXPY, DOT, GEMV_N and GEMV_T flagged by gcc10 tree-vectorizer
25+
26+
ARMV8:
27+
* Improved Apple Vortex support to include cross-compiling
28+
229
====================================================================
330
Version 0.3.11
431
17-Oct-2020
532

6-
common:
33+
common:
734
* API change:
835
the newly added BFLOAT16 functions were renamed to use the
936
letter "B" instead of "H" to avoid potential confusion with
@@ -28,7 +55,7 @@ Version 0.3.11
2855
* Makefile builds no longer misread NO_CBLAS=0 or NO_LAPACK=0 as
2956
enabling these options
3057
* Fixed detection of gfortran when invoked through an mpi wrapper
31-
* Improve thread reinitialization performance with OpenMP xafter a fork
58+
* Improve thread reinitialization performance with OpenMP after a fork
3259
* Added support for building only the subset of the library required
3360
for a particular precision by specifying BUILD_SINGLE, BUILD_DOUBLE
3461
* Optional function name prefixes and suffixes are now correctly
@@ -66,7 +93,6 @@ ARMV8:
6693
* Fixed cpu detection on BSD-like systems
6794
* Fixed compilation in -std=C18 mode
6895

69-
7096
IBM Z:
7197
* Added support for compiling with the clang compiler
7298
* Improved GEMM performance on Z14

Makefile.arm

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,3 +12,8 @@ ifeq ($(CORE), ARMV6)
1212
CCOMMON_OPT += -mfpu=vfp
1313
FCOMMON_OPT += -mfpu=vfp
1414
endif
15+
16+
ifdef HAVE_NEON
17+
CCOMMON_OPT += -mfpu=neon
18+
FCOMMON_OPT += -mfpu=neon
19+
endif

Makefile.rule

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
#
44

55
# This library's version
6-
VERSION = 0.3.11.dev
6+
VERSION = 0.3.12.dev
77

88
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
99
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
@@ -295,10 +295,13 @@ COMMON_PROF = -pg
295295

296296

297297

298-
# the below is not yet configurable, use cmake if you need to build only select types
299-
BUILD_SINGLE = 1
300-
BUILD_DOUBLE = 1
301-
BUILD_COMPLEX = 1
302-
BUILD_COMPLEX16 = 1
298+
# By default the library contains BLAS functions (and LAPACK if selected) for all input types.
299+
# To build a smaller library supporting e.g. only single precision real (SGEMM etc.) or only
300+
# the functions for complex numbers, uncomment the desired type(s) below
301+
# BUILD_SINGLE = 1
302+
# BUILD_DOUBLE = 1
303+
# BUILD_COMPLEX = 1
304+
# BUILD_COMPLEX16 = 1
305+
#
303306
# End of user configuration
304307
#

Makefile.system

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -319,6 +319,7 @@ ifeq ($(GCCVERSIONGTEQ7),1)
319319
else
320320
GCCDUMPVERSION_PARAM := -dumpversion
321321
endif
322+
GCCMINORVERSIONGTEQ1 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 1)
322323
GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 2)
323324
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 7)
324325
endif
@@ -855,7 +856,7 @@ CCOMMON_OPT += -DF_INTERFACE_FLANG
855856
FCOMMON_OPT += -Mrecursive -Kieee
856857
ifeq ($(OSNAME), Linux)
857858
ifeq ($(ARCH), x86_64)
858-
FLANG_VENDOR := $(shell expr `$(FC) --version|cut -f 1 -d "."|head -1`)
859+
# Capture the text before the first "." on the first line of `$(FC) --version`
# so it can be compared against a vendor tag below.
# The pipeline must be passed to $(shell ...) directly: wrapping it in backticks
# makes the shell execute the pipeline's *output* as a command, which leaves
# FLANG_VENDOR without the version text and the vendor check unable to match.
FLANG_VENDOR := $(shell $(FC) --version|cut -f 1 -d "."|head -1)
859860
ifeq ($(FLANG_VENDOR),AOCC)
860861
FCOMMON_OPT += -fno-unroll-loops
861862
endif

Makefile.x86_64

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,6 @@ ifndef DYNAMIC_ARCH
4747
ifndef NO_AVX512
4848
ifeq ($(C_COMPILER), GCC)
4949
# cooperlake support was added in 10.1
50-
GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10)
51-
GCCMINORVERSIONGTEQ1 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 1)
5250
ifeq ($(GCCVERSIONGTEQ10)$(GCCMINORVERSIONGTEQ1), 11)
5351
CCOMMON_OPT += -march=cooperlake
5452
FCOMMON_OPT += -march=cooperlake
@@ -73,10 +71,7 @@ ifndef DYNAMIC_ARCH
7371
ifndef NO_AVX2
7472
ifeq ($(C_COMPILER), GCC)
7573
# AVX2 support was added in 4.7.0
76-
GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
77-
GCCVERSIONGTEQ5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 5)
78-
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
79-
GCCVERSIONCHECK := $(GCCVERSIONGTEQ5)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7)
74+
GCCVERSIONCHECK := $(GCCVERSIONGT4)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7)
8075
ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111))
8176
CCOMMON_OPT += -mavx2
8277
endif

benchmark/amax.c

Lines changed: 56 additions & 114 deletions
Original file line numberDiff line numberDiff line change
@@ -25,125 +25,73 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
2525
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2626
*****************************************************************************/
2727

28-
#include <stdio.h>
29-
#include <stdlib.h>
30-
#ifdef __CYGWIN32__
31-
#include <sys/time.h>
32-
#endif
33-
#include "common.h"
34-
28+
#include "bench.h"
3529

3630
#undef AMAX
3731

3832
#ifdef COMPLEX
3933
#ifdef DOUBLE
40-
#define AMAX BLASFUNC(dzamax)
34+
#define AMAX BLASFUNC(dzamax)
4135
#else
42-
#define AMAX BLASFUNC(scamax)
36+
#define AMAX BLASFUNC(scamax)
4337
#endif
4438
#else
4539
#ifdef DOUBLE
46-
#define AMAX BLASFUNC(damax)
40+
#define AMAX BLASFUNC(damax)
4741
#else
48-
#define AMAX BLASFUNC(samax)
42+
#define AMAX BLASFUNC(samax)
4943
#endif
5044
#endif
5145

52-
#if defined(__WIN32__) || defined(__WIN64__)
53-
54-
#ifndef DELTA_EPOCH_IN_MICROSECS
55-
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
56-
#endif
57-
58-
int gettimeofday(struct timeval *tv, void *tz){
59-
60-
FILETIME ft;
61-
unsigned __int64 tmpres = 0;
62-
static int tzflag;
63-
64-
if (NULL != tv)
65-
{
66-
GetSystemTimeAsFileTime(&ft);
67-
68-
tmpres |= ft.dwHighDateTime;
69-
tmpres <<= 32;
70-
tmpres |= ft.dwLowDateTime;
71-
72-
/*converting file time to unix epoch*/
73-
tmpres /= 10; /*convert into microseconds*/
74-
tmpres -= DELTA_EPOCH_IN_MICROSECS;
75-
tv->tv_sec = (long)(tmpres / 1000000UL);
76-
tv->tv_usec = (long)(tmpres % 1000000UL);
77-
}
78-
79-
return 0;
80-
}
81-
82-
#endif
83-
84-
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
85-
86-
static void *huge_malloc(BLASLONG size){
87-
int shmid;
88-
void *address;
89-
90-
#ifndef SHM_HUGETLB
91-
#define SHM_HUGETLB 04000
92-
#endif
93-
94-
if ((shmid =shmget(IPC_PRIVATE,
95-
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
96-
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
97-
printf( "Memory allocation failed(shmget).\n");
98-
exit(1);
99-
}
100-
101-
address = shmat(shmid, NULL, SHM_RND);
102-
103-
if ((BLASLONG)address == -1){
104-
printf( "Memory allocation failed(shmat).\n");
105-
exit(1);
106-
}
107-
108-
shmctl(shmid, IPC_RMID, 0);
109-
110-
return address;
111-
}
112-
113-
#define malloc huge_malloc
114-
115-
#endif
116-
117-
int main(int argc, char *argv[]){
46+
int main(int argc, char *argv[])
47+
{
11848

11949
FLOAT *x;
12050
blasint m, i;
121-
blasint inc_x=1;
51+
blasint inc_x = 1;
12252
int loops = 1;
12353
int l;
12454
char *p;
12555

56+
int from = 1;
57+
int to = 200;
58+
int step = 1;
12659

127-
int from = 1;
128-
int to = 200;
129-
int step = 1;
130-
131-
struct timeval start, stop;
132-
double time1,timeg;
60+
double time1, timeg;
13361

134-
argc--;argv++;
62+
argc--;
63+
argv++;
13564

136-
if (argc > 0) { from = atol(*argv); argc--; argv++;}
137-
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
138-
if (argc > 0) { step = atol(*argv); argc--; argv++;}
65+
if (argc > 0)
66+
{
67+
from = atol(*argv);
68+
argc--;
69+
argv++;
70+
}
71+
if (argc > 0)
72+
{
73+
to = MAX(atol(*argv), from);
74+
argc--;
75+
argv++;
76+
}
77+
if (argc > 0)
78+
{
79+
step = atol(*argv);
80+
argc--;
81+
argv++;
82+
}
13983

140-
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
141-
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
84+
if ((p = getenv("OPENBLAS_LOOPS")))
85+
loops = atoi(p);
86+
if ((p = getenv("OPENBLAS_INCX")))
87+
inc_x = atoi(p);
14288

143-
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops);
89+
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step, inc_x, loops);
14490

145-
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
146-
fprintf(stderr,"Out of Memory!!\n");exit(1);
91+
if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL)
92+
{
93+
fprintf(stderr, "Out of Memory!!\n");
94+
exit(1);
14795
}
14896

14997
#ifdef __linux
@@ -152,37 +100,31 @@ int main(int argc, char *argv[]){
152100

153101
fprintf(stderr, " SIZE Flops\n");
154102

155-
for(m = from; m <= to; m += step)
103+
for (m = from; m <= to; m += step)
156104
{
157105

158-
timeg=0;
159-
160-
fprintf(stderr, " %6d : ", (int)m);
106+
timeg = 0;
107+
fprintf(stderr, " %6d : ", (int)m);
161108

109+
for (l = 0; l < loops; l++)
110+
{
162111

163-
for (l=0; l<loops; l++)
164-
{
165-
166-
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
167-
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
168-
}
169-
170-
gettimeofday( &start, (struct timezone *)0);
171-
AMAX (&m, x, &inc_x);
172-
gettimeofday( &stop, (struct timezone *)0);
173-
174-
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
175-
176-
timeg += time1;
112+
for (i = 0; i < m * COMPSIZE * abs(inc_x); i++)
113+
{
114+
x[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5;
115+
}
177116

117+
begin();
118+
AMAX(&m, x, &inc_x);
119+
end();
120+
timeg += getsec();
178121
}
179122

180123
timeg /= loops;
181124

182125
fprintf(stderr,
183-
" %10.2f MFlops %10.6f sec\n",
184-
COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg);
185-
126+
" %10.2f MFlops %10.6f sec\n",
127+
COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg);
186128
}
187129

188130
return 0;

0 commit comments

Comments
 (0)