Skip to content

Commit 8fe7f80

Browse files
authored
Merge pull request #4408 from OpenMathLib/develop
merge develop for 0.3.26 release
2 parents 5e1a429 + cddd35f commit 8fe7f80

File tree

411 files changed

+76859
-5484
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

411 files changed

+76859
-5484
lines changed

.cirrus.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ task:
2929
- mkdir build
3030
- cd build
3131
- cmake -DTARGET=VORTEX -DCMAKE_C_COMPILER=clang -DBUILD_SHARED_LIBS=ON ..
32-
- make
32+
- make -j 4
3333

3434
task:
3535
name: AppleM1/GCC/MAKE/OPENMP

.github/workflows/loongarch64.yml

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,13 @@ jobs:
1616
include:
1717
- target: LOONGSONGENERIC
1818
triple: loongarch64-unknown-linux-gnu
19-
opts: NO_SHARED=1 TARGET=LOONGSONGENERIC
19+
opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSONGENERIC
2020
- target: LOONGSON3R5
2121
triple: loongarch64-unknown-linux-gnu
22-
opts: NO_SHARED=1 TARGET=LOONGSON3R5
22+
opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON3R5
2323
- target: LOONGSON2K1000
2424
triple: loongarch64-unknown-linux-gnu
25-
opts: NO_SHARED=1 TARGET=LOONGSON2K1000
25+
opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON2K1000
2626
- target: DYNAMIC_ARCH
2727
triple: loongarch64-unknown-linux-gnu
2828
opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=GENERIC
@@ -40,8 +40,9 @@ jobs:
4040
4141
- name: Download and install loongarch64-toolchain
4242
run: |
43-
wget https://github.com/loongson/build-tools/releases/download/2022.09.06/loongarch64-clfs-7.3-cross-tools-gcc-glibc.tar.xz
44-
tar -xf loongarch64-clfs-7.3-cross-tools-gcc-glibc.tar.xz -C /opt
43+
wget https://github.com/sunhaiyong1978/CLFS-for-LoongArch/releases/download/8.1/CLFS-loongarch64-8.1-x86_64-cross-tools-gcc-glibc.tar.xz
44+
#wget https://github.com/loongson/build-tools/releases/download/2023.08.08/CLFS-loongarch64-8.1-x86_64-cross-tools-gcc-glibc.tar.xz
45+
tar -xf CLFS-loongarch64-8.1-x86_64-cross-tools-gcc-glibc.tar.xz -C /opt
4546
4647
- name: Set env
4748
run: |

CMakeLists.txt

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ project(OpenBLAS C ASM)
88

99
set(OpenBLAS_MAJOR_VERSION 0)
1010
set(OpenBLAS_MINOR_VERSION 3)
11-
set(OpenBLAS_PATCH_VERSION 25)
11+
set(OpenBLAS_PATCH_VERSION 25.dev)
1212

1313
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
1414

@@ -249,20 +249,21 @@ if (${CMAKE_SYSTEM_NAME} MATCHES "AIX|Android|Linux|FreeBSD|OpenBSD|NetBSD|Drago
249249
endif()
250250
endif()
251251

252-
if (APPLE AND DYNAMIC_ARCH AND BUILD_SHARED_LIBS)
252+
# This hack appears to be unnecessary from macOS 11 Big Sur (Darwin 20) onward
253+
if (APPLE AND BUILD_SHARED_LIBS AND CMAKE_HOST_SYSTEM_VERSION VERSION_LESS 20)
253254
set (CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1)
254255
if (NOT NOFORTRAN)
255256
set (CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1)
256257
set (CMAKE_Fortran_CREATE_SHARED_LIBRARY
257-
"sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ar -ru libopenblas.a && exit 0' "
258-
"sh -c 'ar -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' "
258+
"sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru libopenblas.a && exit 0' "
259+
"sh -c '${CMAKE_AR} -rs libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' "
259260
"sh -c 'echo \"\" | ${CMAKE_Fortran_COMPILER} -o dummy.o -c -x f95-cpp-input - '"
260261
"sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load dummy.o -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'"
261262
"sh -c 'ls -l ${CMAKE_BINARY_DIR}/lib'")
262263
else ()
263264
set (CMAKE_C_CREATE_SHARED_LIBRARY
264-
"sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ar -ru libopenblas.a && exit 0' "
265-
"sh -c 'ar -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' "
265+
"sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru libopenblas.a && exit 0' "
266+
"sh -c '${CMAKE_AR} -rs libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' "
266267
"sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'")
267268
endif ()
268269
endif()
@@ -541,7 +542,7 @@ if(NOT NO_LAPACKE)
541542
ADD_CUSTOM_TARGET(genlapacke
542543
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h.in "${CMAKE_BINARY_DIR}/lapacke_mangling.h"
543544
)
544-
install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64})
545+
install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
545546
endif()
546547

547548
# Install pkg-config files

CONTRIBUTORS.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,3 +216,6 @@ In chronological order:
216216

217217
* Pablo Romero <https://github.com/pablorcum>
218218
* [2022-08] Fix building from sources for QNX
219+
220+
* Mark Seminatore <https://github.com/mseminatore>
221+
* [2023-11-09] Improve Windows threading performance scaling

Changelog.txt

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,49 @@
11
OpenBLAS ChangeLog
2+
====================================================================
3+
Version 0.3.26
4+
2-Jan-2024
5+
6+
general:
7+
- improved the version of openblas.pc that is created by the CMAKE build
8+
- fixed a CMAKE-specific build problem on older versions of MacOS
9+
- worked around linking problems on old versions of MacOS
10+
- corrected installation location of the lapacke_mangling header in CMAKE builds
11+
- added type declarations for complex variables to the MSVC-specific parts of the LAPACK header
12+
- significantly sped up ?GESV for small problem sizes by introducing a lower bound for multithreading
13+
- imported additions and corrections from the Reference-LAPACK project:
14+
- added new LAPACK functions for truncated QR with pivoting (Reference-LAPACK PRs 891&941)
15+
- handle miscalculation of minimum work array size in corner cases (Reference-LAPACK PR 942)
16+
- fixed use of uninitialized variables in ?GEDMD and improved inline documentation (PR 959)
17+
- fixed use of uninitialized variables (and consequential failures) in ?BBCSD (PR 967)
18+
- added tests for the recently introduced Dynamic Mode Decomposition functions (PR 736)
19+
- fixed several memory leaks in the LAPACK testsuite (PR 953)
20+
- fixed counting of testsuite results by the Python script (PR 954)
21+
22+
x86-64:
23+
- fixed computation of CASUM on SkylakeX and newer targets in the special
24+
case that AVX512 is not supported by the compiler or operating environment
25+
- fixed potential undefined behaviour in the CASUM/ZASUM kernels for AVX512 targets
26+
- worked around a problem in the pre-AVX kernels for GEMV
27+
- sped up the thread management code on MS Windows
28+
29+
arm64:
30+
- fixed building of the LAPACK testsuite with Xcode 15 on Apple M1 and newer
31+
- sped up the thread management code on MS Windows
32+
- sped up SGEMM and DGEMM on Neoverse V1 and N1
33+
- sped up ?DOT on SVE-capable targets
34+
- reduced the number of targets in DYNAMIC_ARCH builds by eliminating functionally equivalent ones
35+
- included support for Apple M1 and newer targets in DYNAMIC_ARCH builds
36+
37+
power:
38+
- improved the SGEMM kernel for POWER10
39+
- fixed compilation with (very) old versions of gcc
40+
- fixed detection of old 32bit PPC targets in CMAKE-based builds
41+
- added autodetection of the POWERPC 7400 subtype
42+
- fixed CMAKE-based compilation for PPCG4 and PPC970 targets
43+
44+
loongarch64:
45+
- added and improved optimized kernels for almost all BLAS functions
46+
247
====================================================================
348
Version 0.3.25
449
12-Nov-2023

GotoBLAS_06WeirdPerformance.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
operation is finished.
1212

1313

14-
2. Simlar problem may happen under virtual machine. If supervisor
14+
2. Similar problem may happen under virtual machine. If supervisor
1515
allocates different cores for each scheduling, BLAS performance
1616
will be bad. This is because BLAS also utilizes all cache,
1717
unexpected re-schedule for different core may result in heavy

Makefile.power

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,19 @@ endif
1111

1212
ifeq ($(CORE), POWER10)
1313
ifneq ($(C_COMPILER), PGI)
14+
ifeq ($(C_COMPILER), GCC)
15+
ifeq ($(GCCVERSIONGTEQ10), 1)
1416
CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
17+
else ifneq ($(GCCVERSIONGT4), 1)
18+
$(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended)
19+
CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math
20+
else
21+
$(warning your compiler is too old to fully support POWER10, getting a newer version of gcc is recommended)
22+
CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math
23+
endif
24+
else
25+
CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
26+
endif
1527
ifeq ($(F_COMPILER), IBM)
1628
FCOMMON_OPT += -O2 -qrecur -qnosave -qarch=pwr10 -qtune=pwr10 -qfloat=nomaf -qzerosize
1729
else

Makefile.rule

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
#
44

55
# This library's version
6-
VERSION = 0.3.25
6+
VERSION = 0.3.25.dev
77

88
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
99
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library

Makefile.system

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -407,6 +407,7 @@ XCVER = $(shell pkgutil --pkg-info=com.apple.pkg.CLTools_Executables |awk '/vers
407407
endif
408408
ifeq (x$(XCVER), x 15)
409409
CCOMMON_OPT += -Wl,-ld_classic
410+
FCOMMON_OPT += -Wl,-ld_classic
410411
endif
411412
endif
412413

@@ -676,16 +677,12 @@ ifeq ($(ARCH), arm64)
676677
DYNAMIC_CORE = ARMV8
677678
DYNAMIC_CORE += CORTEXA53
678679
DYNAMIC_CORE += CORTEXA57
679-
DYNAMIC_CORE += CORTEXA72
680-
DYNAMIC_CORE += CORTEXA73
681680
DYNAMIC_CORE += NEOVERSEN1
682681
ifneq ($(NO_SVE), 1)
683682
DYNAMIC_CORE += NEOVERSEV1
684683
DYNAMIC_CORE += NEOVERSEN2
685684
DYNAMIC_CORE += ARMV8SVE
686685
endif
687-
DYNAMIC_CORE += CORTEXA55
688-
DYNAMIC_CORE += FALKOR
689686
DYNAMIC_CORE += THUNDERX
690687
DYNAMIC_CORE += THUNDERX2T99
691688
DYNAMIC_CORE += TSV110

README.md

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -196,20 +196,22 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
196196
```sh
197197
make HOSTCC=gcc TARGET=C910V CC=riscv64-unknown-linux-gnu-gcc FC=riscv64-unknown-linux-gnu-gfortran
198198
```
199-
(also known to work on C906)
199+
(also known to work on C906 as long as you use only single-precision functions - its instruction set support appears to be incomplete in double precision)
200200

201201
### Support for multiple targets in a single library
202202

203203
OpenBLAS can be built for multiple targets with runtime detection of the target cpu by specifying `DYNAMIC_ARCH=1` in Makefile.rule, on the gmake command line or as `-DDYNAMIC_ARCH=TRUE` in cmake.
204204

205-
For **x86_64**, the list of targets this activates contains Prescott, Core2, Nehalem, Barcelona, Sandybridge, Bulldozer, Piledriver, Steamroller, Excavator, Haswell, Zen, SkylakeX. For cpu generations not included in this list, the corresponding older model is used. If you also specify `DYNAMIC_OLDER=1`, specific support for Penryn, Dunnington, Opteron, Opteron/SSE3, Bobcat, Atom and Nano is added. Finally there is an option `DYNAMIC_LIST` that allows to specify an individual list of targets to include instead of the default.
205+
For **x86_64**, the list of targets this activates contains Prescott, Core2, Nehalem, Barcelona, Sandybridge, Bulldozer, Piledriver, Steamroller, Excavator, Haswell, Zen, SkylakeX, Cooper Lake, Sapphire Rapids. For cpu generations not included in this list, the corresponding older model is used. If you also specify `DYNAMIC_OLDER=1`, specific support for Penryn, Dunnington, Opteron, Opteron/SSE3, Bobcat, Atom and Nano is added. Finally there is an option `DYNAMIC_LIST` that allows to specify an individual list of targets to include instead of the default.
206206

207207
`DYNAMIC_ARCH` is also supported on **x86**, where it translates to Katmai, Coppermine, Northwood, Prescott, Banias,
208208
Core2, Penryn, Dunnington, Nehalem, Athlon, Opteron, Opteron_SSE3, Barcelona, Bobcat, Atom and Nano.
209209

210-
On **ARMV8**, it enables support for CortexA53, CortexA57, CortexA72, CortexA73, Falkor, ThunderX, ThunderX2T99, TSV110 as well as generic ARMV8 cpus.
210+
On **ARMV8**, it enables support for CortexA53, CortexA57, CortexA72, CortexA73, Falkor, ThunderX, ThunderX2T99, TSV110 as well as generic ARMV8 cpus. If compiler support for SVE is available at build time, support for NeoverseN2, NeoverseV1 as well as generic ArmV8SVE targets is also enabled.
211211

212-
For **POWER**, the list encompasses POWER6, POWER8 and POWER9, on **ZARCH** it comprises Z13 and Z14.
212+
For **POWER**, the list encompasses POWER6, POWER8 and POWER9. POWER10 is additionally available if a sufficiently recent compiler is used for the build.
213+
214+
On **ZARCH**, it comprises Z13 and Z14 as well as generic zarch support.
213215

214216
The `TARGET` option can be used in conjunction with `DYNAMIC_ARCH=1` to specify which cpu model should be assumed for all the
215217
common code in the library, usually you will want to set this to the oldest model you expect to encounter.

0 commit comments

Comments
 (0)