Skip to content

Commit 4889114

Browse files
authored
Merge pull request #3488 from xianyi/develop
Update from develop branch for 0.3.19 release
2 parents 6025dac + 54a0c0b commit 4889114

File tree

163 files changed

+24400
-998
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

163 files changed

+24400
-998
lines changed

CMakeLists.txt

Lines changed: 218 additions & 125 deletions
Large diffs are not rendered by default.

CONTRIBUTORS.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -197,3 +197,7 @@ In chronological order:
197197

198198
* River Dillon <oss@outerpassage.net>
199199
* [2021-07-10] fix compilation with musl libc
200+
201+
* Bine Brank <https://github.com/binebrank>
202+
* [2021-10-27] Add vector-length-agnostic DGEMM kernels for Arm SVE
203+
* [2021-11-20] Vector-length-agnostic Arm SVE copy routines for DGEMM, DTRMM, DSYMM

Changelog.txt

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,51 @@
11
OpenBLAS ChangeLog
2+
====================================================================
3+
Version 0.3.19
4+
19-Dec-2021
5+
6+
general:
7+
- reverted unsafe TRSV/ZTRSV optimizations introduced in 0.3.16
8+
- fixed a potential thread race in the thread buffer reallocation routines
9+
that were introduced in 0.3.18
10+
- fixed miscounting of thread pool size on Linux with OMP_PROC_BIND=TRUE
11+
- fixed CBLAS interfaces for CSROT/ZDROT and CROTG/ZROTG
12+
- made automatic library suffix for CMAKE builds with INTERFACE64 available
13+
to CBLAS-only builds
14+
15+
x86_64:
16+
- DYNAMIC_ARCH builds now fall back to the cpu with the most similar capabilities
17+
when an unknown CPUID is encountered, instead of defaulting to Prescott
18+
- added cpu detection for Intel Alder Lake
19+
- added cpu detection for Intel Sapphire Rapids
20+
- added an optimized SBGEMM kernel for Sapphire Rapids
21+
- fixed DYNAMIC_ARCH builds on OSX with CMAKE
22+
- worked around DYNAMIC_ARCH builds made on Sandybridge failing on SkylakeX
23+
- fixed missing thread initialization for static builds on Windows/MSVC
24+
- fixed an excessive read in ZSYMV
25+
26+
POWER:
27+
- added support for POWER10 in big-endian mode
28+
- added support for building with CMAKE
29+
- added optimized SGEMM and DGEMM kernels for small matrix sizes
30+
31+
ARMV8:
32+
- added basic support and cputype detection for Fujitsu A64FX
33+
- added a generic ARMV8SVE target
34+
- added SVE-enabled SGEMM and DGEMM kernels for ARMV8SVE and A64FX
35+
- added optimized CGEMM and ZGEMM kernels for Cortex A53 and A55 cpus
36+
- fixed cpuid detection for Apple M1 and improved performance
37+
- improved compiler flag setting in CMAKE builds
38+
39+
RISCV64:
40+
- fixed improper initialization in CSCAL/ZSCAL for strided access patterns
41+
42+
MIPS:
43+
- added a GENERIC target for MIPS32
44+
- added support for cross-compiling to MIPS32 on x86_64 using CMAKE
45+
46+
MIPS64:
47+
- fixed misdetection of MSA capability
48+
249
====================================================================
350
Version 0.3.18
451
02-Oct-2021

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ export NOFORTRAN
3232
export NO_LAPACK
3333
endif
3434

35-
LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS))
35+
LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast -O -Og -Os,$(LAPACK_FFLAGS))
3636

3737
SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test
3838

Makefile.arm64

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
ifneq ($(C_COMPILER), PGI)
22

3-
ifneq ($(GCCVERSIONGT4), 1)
3+
ifeq ($(C_COMPILER), CLANG)
4+
ISCLANG=1
5+
endif
6+
ifneq (1, $(filter 1,$(GCCVERSIONGT4) $(ISCLANG)))
47
CCOMMON_OPT += -march=armv8-a
58
ifneq ($(F_COMPILER), NAG)
69
FCOMMON_OPT += -march=armv8-a
@@ -17,6 +20,13 @@ FCOMMON_OPT += -march=armv8-a
1720
endif
1821
endif
1922

23+
ifeq ($(CORE), ARMV8SVE)
24+
CCOMMON_OPT += -march=armv8-a+sve
25+
ifneq ($(F_COMPILER), NAG)
26+
FCOMMON_OPT += -march=armv8-a+sve
27+
endif
28+
endif
29+
2030
ifeq ($(CORE), CORTEXA53)
2131
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53
2232
ifneq ($(F_COMPILER), NAG)
@@ -48,7 +58,7 @@ endif
4858
# Use a72 tunings because Neoverse-N1 is only available
4959
# in GCC>=9
5060
ifeq ($(CORE), NEOVERSEN1)
51-
ifeq ($(GCCVERSIONGTEQ7), 1)
61+
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
5262
ifeq ($(GCCVERSIONGTEQ9), 1)
5363
CCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1
5464
ifneq ($(F_COMPILER), NAG)
@@ -70,7 +80,7 @@ endif
7080

7181
# Use a53 tunings because a55 is only available in GCC>=8.1
7282
ifeq ($(CORE), CORTEXA55)
73-
ifeq ($(GCCVERSIONGTEQ7), 1)
83+
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
7484
ifeq ($(GCCVERSIONGTEQ8), 1)
7585
CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a55
7686
ifneq ($(F_COMPILER), NAG)
@@ -132,7 +142,7 @@ FCOMMON_OPT += -march=armv8.3-a
132142
endif
133143
endif
134144

135-
ifeq ($(GCCVERSIONGTEQ9), 1)
145+
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ9) $(ISCLANG)))
136146
ifeq ($(CORE), TSV110)
137147
CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
138148
ifneq ($(F_COMPILER), NAG)
@@ -150,6 +160,15 @@ endif
150160
endif
151161
endif
152162

163+
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(ISCLANG)))
164+
ifeq ($(CORE), A64FX)
165+
CCOMMON_OPT += -march=armv8.2-a+sve -mtune=a64fx
166+
ifneq ($(F_COMPILER), NAG)
167+
FCOMMON_OPT += -march=armv8.2-a+sve -mtune=a64fx
168+
endif
169+
endif
153170
endif
154171

155-
endif
172+
endif
173+
174+
endif

Makefile.rule

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
#
44

55
# This library's version
6-
VERSION = 0.3.18
6+
VERSION = 0.3.18.dev
77

88
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
99
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library

Makefile.system

Lines changed: 48 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,10 @@ ifndef TOPDIR
99
TOPDIR = .
1010
endif
1111

12-
# If ARCH is not set, we use the host system's architecture for getarch compile options.
13-
ifndef ARCH
12+
# We need to use the host system's architecture for getarch compile options, even (and especially) when cross-compiling
1413
HOSTARCH := $(shell uname -m)
15-
else
16-
HOSTARCH = $(ARCH)
14+
ifeq ($(HOSTARCH), amd64)
15+
HOSTARCH=x86_64
1716
endif
1817

1918
# Catch conflicting usage of ARCH in some BSD environments
@@ -102,7 +101,7 @@ GETARCH_FLAGS += -DUSER_TARGET
102101
ifeq ($(TARGET), GENERIC)
103102
ifeq ($(DYNAMIC_ARCH), 1)
104103
override NO_EXPRECISION=1
105-
export NO_EXPRECiSION
104+
export NO_EXPRECISION
106105
endif
107106
endif
108107
endif
@@ -119,6 +118,9 @@ endif
119118
ifeq ($(TARGET), COOPERLAKE)
120119
GETARCH_FLAGS := -DFORCE_NEHALEM
121120
endif
121+
ifeq ($(TARGET), SAPPHIRERAPIDS)
122+
GETARCH_FLAGS := -DFORCE_NEHALEM
123+
endif
122124
ifeq ($(TARGET), SANDYBRIDGE)
123125
GETARCH_FLAGS := -DFORCE_NEHALEM
124126
endif
@@ -143,8 +145,13 @@ endif
143145
ifeq ($(TARGET), POWER8)
144146
GETARCH_FLAGS := -DFORCE_POWER6
145147
endif
148+
ifeq ($(TARGET), POWER9)
149+
GETARCH_FLAGS := -DFORCE_POWER6
150+
endif
151+
ifeq ($(TARGET), POWER10)
152+
GETARCH_FLAGS := -DFORCE_POWER6
153+
endif
146154
endif
147-
148155

149156
#TARGET_CORE will override TARGET which is used in DYNAMIC_ARCH=1.
150157
#
@@ -164,6 +171,9 @@ endif
164171
ifeq ($(TARGET_CORE), COOPERLAKE)
165172
GETARCH_FLAGS := -DFORCE_NEHALEM
166173
endif
174+
ifeq ($(TARGET_CORE), SAPPHIRERAPIDS)
175+
GETARCH_FLAGS := -DFORCE_NEHALEM
176+
endif
167177
ifeq ($(TARGET_CORE), SANDYBRIDGE)
168178
GETARCH_FLAGS := -DFORCE_NEHALEM
169179
endif
@@ -251,6 +261,8 @@ endif
251261
#For small matrix optimization
252262
ifeq ($(ARCH), x86_64)
253263
SMALL_MATRIX_OPT = 1
264+
else ifeq ($(CORE), POWER10)
265+
SMALL_MATRIX_OPT = 1
254266
endif
255267
ifeq ($(SMALL_MATRIX_OPT), 1)
256268
CCOMMON_OPT += -DSMALL_MATRIX_OPT
@@ -260,6 +272,10 @@ endif
260272
ifndef GOTOBLAS_MAKEFILE
261273
export GOTOBLAS_MAKEFILE = 1
262274

275+
# Determine if the assembler is GNU Assembler
276+
HAVE_GAS := $(shell $(AS) -v < /dev/null 2>&1 | grep GNU 2>&1 >/dev/null ; echo $$?)
277+
GETARCH_FLAGS += -DHAVE_GAS=$(HAVE_GAS)
278+
263279
# Generating Makefile.conf and config.h
264280
DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" HOST_CFLAGS="$(GETARCH_FLAGS)" CFLAGS="$(CFLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) TARGET=$(TARGET) all)
265281

@@ -307,7 +323,7 @@ else
307323
SMP = 1
308324
endif
309325
else
310-
ifeq ($(NUM_THREAD), 1)
326+
ifeq ($(NUM_THREADS), 1)
311327
SMP =
312328
else
313329
SMP = 1
@@ -892,15 +908,25 @@ endif
892908

893909
ifeq ($(C_COMPILER), PGI)
894910
PGCVERSIONGT20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \> 20)
895-
PGCVERSIONGTEQ20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \>= 20)
896-
PGCMINORVERSIONGE11 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -c 4-5` == 11)
911+
PGCVERSIONEQ20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` == 20)
912+
PGCMINORVERSIONGE11 := $(shell expr `$(CC) --version|sed -n "2p" |cut -d "-" -f 1 |sed -e "s/[^0-9.]//g" |cut -c 4-5` \>= 11)
897913
PGCVERSIONCHECK := $(PGCVERSIONGT20)$(PGCVERSIONEQ20)$(PGCMINORVERSIONGE11)
898-
ifeq ($(PGCVERSIONCHECK), $(filter $(PGCVERSIONCHECK), 110 111 011))
914+
ifeq ($(PGCVERSIONCHECK), $(filter $(PGCVERSIONCHECK), 100 101 011))
899915
NEWPGI := 1
916+
PGCVERSIONGT21 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \> 21)
917+
PGCVERSIONEQ21 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` == 21)
918+
PGCVERSIONCHECK2 := $(PGCVERSIONGT21)$(PGCVERSIONEQ21)$(PGCMINORVERSIONGE11)
919+
ifeq ($(PGCVERSIONCHECK2), $(filter $(PGCVERSIONCHECK2), 100 101 011))
920+
NEWPGI2 := 1
921+
endif
900922
endif
901923
ifdef BINARY64
902924
ifeq ($(ARCH), x86_64)
925+
ifneq ($(NEWPGI2),1)
903926
CCOMMON_OPT += -tp p7-64
927+
else
928+
CCOMMON_OPT += -tp px
929+
endif
904930
ifneq ($(NEWPGI),1)
905931
CCOMMON_OPT += -D__MMX__ -Mnollvm
906932
endif
@@ -915,7 +941,11 @@ endif
915941
endif
916942
endif
917943
else
944+
ifneq ($(NEWPGI2),1)
918945
CCOMMON_OPT += -tp p7
946+
else
947+
CCOMMON_OPT += -tp px
948+
endif
919949
endif
920950
endif
921951

@@ -1092,8 +1122,12 @@ FCOMMON_OPT += -i8
10921122
endif
10931123
endif
10941124
ifeq ($(ARCH), x86_64)
1125+
ifneq ($(NEWPGI2),1)
10951126
FCOMMON_OPT += -tp p7-64
10961127
else
1128+
FCOMMON_OPT += -tp px
1129+
endif
1130+
else
10971131
ifeq ($(ARCH), power)
10981132
ifeq ($(CORE), POWER6)
10991133
$(warning NVIDIA HPC compilers do not support POWER6.)
@@ -1643,8 +1677,10 @@ export HAVE_VFP
16431677
export HAVE_VFPV3
16441678
export HAVE_VFPV4
16451679
export HAVE_NEON
1646-
export HAVE_MSA
1647-
export MSA_FLAGS
1680+
ifndef NO_MSA
1681+
export HAVE_MSA
1682+
export MSA_FLAGS
1683+
endif
16481684
export KERNELDIR
16491685
export FUNCTION_PROFILE
16501686
export TARGET_CORE

Makefile.x86_64

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,40 @@ CCOMMON_OPT += -march=cooperlake
8181
ifneq ($(F_COMPILER), NAG)
8282
FCOMMON_OPT += -march=cooperlake
8383
endif
84+
else # not supported by this gcc version, fall back to avx512
85+
CCOMMON_OPT += -march=skylake-avx512
86+
ifneq ($(F_COMPILER), NAG)
87+
FCOMMON_OPT += -march=skylake-avx512
88+
endif
89+
endif
90+
endif
91+
ifeq ($(OSNAME), CYGWIN_NT)
92+
CCOMMON_OPT += -fno-asynchronous-unwind-tables
93+
FCOMMON_OPT += -fno-asynchronous-unwind-tables
94+
endif
95+
ifeq ($(OSNAME), WINNT)
96+
ifeq ($(C_COMPILER), GCC)
97+
CCOMMON_OPT += -fno-asynchronous-unwind-tables
98+
FCOMMON_OPT += -fno-asynchronous-unwind-tables
99+
endif
100+
endif
101+
endif
102+
endif
103+
104+
ifeq ($(CORE), SAPPHIRERAPIDS)
105+
ifndef NO_AVX512
106+
ifeq ($(C_COMPILER), GCC)
107+
# Sapphire Rapids support was added in GCC 11
108+
ifeq ($(GCCVERSIONGTEQ11), 1)
109+
CCOMMON_OPT += -march=sapphirerapids
110+
ifneq ($(F_COMPILER), NAG)
111+
FCOMMON_OPT += -march=sapphirerapids
112+
endif
113+
else # not supported by this gcc version, fall back to avx512
114+
CCOMMON_OPT += -march=skylake-avx512
115+
ifneq ($(F_COMPILER), NAG)
116+
FCOMMON_OPT += -march=skylake-avx512
117+
endif
84118
endif
85119
endif
86120
ifeq ($(OSNAME), CYGWIN_NT)

TargetList.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ HASWELL
2323
SKYLAKEX
2424
ATOM
2525
COOPERLAKE
26+
SAPPHIRERAPIDS
2627

2728
b)AMD CPU:
2829
ATHLON

appveyor.yml

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -29,15 +29,15 @@ environment:
2929
global:
3030
CONDA_INSTALL_LOCN: C:\\Miniconda36-x64
3131
matrix:
32-
- COMPILER: clang-cl
33-
WITH_FORTRAN: ON
34-
- COMPILER: clang-cl
35-
DYNAMIC_ARCH: ON
36-
WITH_FORTRAN: OFF
37-
- COMPILER: cl
38-
- COMPILER: MinGW64-gcc-7.2.0-mingw
39-
DYNAMIC_ARCH: OFF
40-
WITH_FORTRAN: ignore
32+
# - COMPILER: clang-cl
33+
# WITH_FORTRAN: ON
34+
# - COMPILER: clang-cl
35+
# DYNAMIC_ARCH: ON
36+
# WITH_FORTRAN: OFF
37+
# - COMPILER: cl
38+
# - COMPILER: MinGW64-gcc-7.2.0-mingw
39+
# DYNAMIC_ARCH: OFF
40+
# WITH_FORTRAN: ignore
4141
- APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015
4242
COMPILER: MinGW-gcc-6.3.0-32
4343
- APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015
@@ -46,6 +46,7 @@ environment:
4646

4747
install:
4848
- if [%COMPILER%]==[clang-cl] call %CONDA_INSTALL_LOCN%\Scripts\activate.bat
49+
- if [%COMPILER%]==[clang-cl] conda update --yes -n base conda
4950
- if [%COMPILER%]==[clang-cl] conda config --add channels conda-forge --force
5051
- if [%COMPILER%]==[clang-cl] conda config --set auto_update_conda false
5152
- if [%COMPILER%]==[clang-cl] conda install --yes --quiet clangdev cmake ninja flang=11.0.1
@@ -64,8 +65,8 @@ before_build:
6465
- if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] cmake -G "MinGW Makefiles" -DNOFORTRAN=1 ..
6566
- if [%COMPILER%]==[MinGW-gcc-6.3.0-32] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 ..
6667
- if [%COMPILER%]==[MinGW-gcc-5.3.0] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 ..
67-
- if [%WITH_FORTRAN%]==[OFF] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON ..
68-
- if [%WITH_FORTRAN%]==[ON] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 ..
68+
- if [%WITH_FORTRAN%]==[OFF] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_MT=mt -DMSVC_STATIC_CRT=ON ..
69+
- if [%WITH_FORTRAN%]==[ON] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DCMAKE_MT=mt -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 ..
6970
- if [%USE_OPENMP%]==[ON] cmake -DUSE_OPENMP=ON ..
7071
- if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' ..
7172

0 commit comments

Comments
 (0)