Skip to content

Commit fa9a30b

Browse files
authored
Merge pull request #19 from xianyi/develop
rebase
2 parents 9e0dbe8 + d90ca75 commit fa9a30b

File tree

4 files changed

+67
-11
lines changed

4 files changed

+67
-11
lines changed

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
66
project(OpenBLAS C ASM)
77
set(OpenBLAS_MAJOR_VERSION 0)
88
set(OpenBLAS_MINOR_VERSION 3)
9-
set(OpenBLAS_PATCH_VERSION 13.dev)
9+
set(OpenBLAS_PATCH_VERSION 14.dev)
1010
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
1111

1212
# Adhere to GNU filesystem layout conventions

Changelog.txt

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,52 @@
11
OpenBLAS ChangeLog
2+
====================================================================
3+
Version 0.3.14
4+
17-Mar-2021
5+
6+
common:
7+
* Fixed a race condition on thread shutdown in non-OpenMP builds
8+
* Fixed custom BUFFERSIZE option getting ignored in gmake builds
9+
* Fixed CMake compilation of the TRMM kernels for GENERIC platforms
10+
* Added CBLAS interfaces for CROTG, ZROTG, CSROT and ZDROT
11+
* Improved performance of OMATCOPY_RT across all platforms
12+
* Changed perl scripts to use env instead of a hardcoded /usr/bin/perl
13+
* Fixed potential misreading of the GCC compiler version in the build scripts
14+
* Fixed convergence problems in LAPACK complex GGEV/GGES (Reference-LAPACK #477)
15+
* Reduced the stack size requirements for running the LAPACK testsuite (Reference-LAPACK #335)
16+
17+
RISCV:
18+
* Fixed compilation on RISCV (missing entry in getarch)
19+
20+
POWER:
21+
* Fixed compilation for DYNAMIC_ARCH with clang and with old gcc versions
22+
* Added support for compilation on FreeBSD/ppc64le
23+
* Added optimized POWER10 kernels for SSCAL, DSCAL, CSCAL, ZSCAL
24+
* Added optimized POWER10 kernels for SROT, DROT, CDOT, SASUM, DASUM
25+
* Improved SSWAP, DSWAP, CSWAP, ZSWAP performance on POWER10
26+
* Improved SCOPY and CCOPY performance on POWER10
27+
* Improved SGEMM and DGEMM performance on POWER10
28+
* Added support for compilation with the NVIDIA HPC compiler
29+
30+
x86_64:
31+
* Added an optimized bfloat16 GEMM kernel for Cooperlake
32+
* Added CPUID autodetection for Intel Rocket Lake and Tiger Lake CPUs
33+
* Improved the performance of SASUM, DASUM, SROT, DROT on AMD Ryzen CPUs
34+
* Added support for compilation with the NAG Fortran compiler
35+
* Fixed recognition of the AMD AOCC compiler
36+
* Fixed compilation for DYNAMIC_ARCH with clang on Windows
37+
* Added support for running the BLAS/CBLAS tests on Windows
38+
* Fixed signatures of the TLS callback functions for Windows x64
39+
* Fixed various issues with FMA intrinsics support handling
40+
41+
ARM:
42+
* Added support for embedded Cortex M targets via a new option EMBEDDED
43+
44+
ARMV8:
45+
* Fixed the THUNDERX2T99 and NEOVERSEN1 DNRM2/ZNRM2 kernels for inputs with Inf
46+
* Added support for the DYNAMIC_LIST option
47+
* Added support for compilation with the NVIDIA HPC compiler
48+
* Added support for compiling with the NAG Fortran compiler
49+
250
====================================================================
351
Version 0.3.13
452
12-Dec-2020

Makefile.rule

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
#
44

55
# This library's version
6-
VERSION = 0.3.13.dev
6+
VERSION = 0.3.14.dev
77

88
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
99
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library

param.h

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
7272
#ifndef PARAM_H
7373
#define PARAM_H
7474

75+
#define LONGCAST (BLASLONG)
76+
#if defined(__BYTE_ORDER__)
77+
#if __GNUC__ < 9
78+
#undef LONGCAST
79+
#define LONGCAST
80+
#endif
81+
#endif
82+
7583
#define SBGEMM_DEFAULT_UNROLL_N 4
7684
#define SBGEMM_DEFAULT_UNROLL_M 8
7785
#define SBGEMM_DEFAULT_UNROLL_MN 32
@@ -2088,7 +2096,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
20882096
#ifdef PPCG4
20892097
#define GEMM_DEFAULT_OFFSET_A 0
20902098
#define GEMM_DEFAULT_OFFSET_B 1024
2091-
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL
2099+
#define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL
20922100

20932101
#define SGEMM_DEFAULT_UNROLL_M 16
20942102
#define SGEMM_DEFAULT_UNROLL_N 4
@@ -2119,7 +2127,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
21192127

21202128
#define GEMM_DEFAULT_OFFSET_A 2688
21212129
#define GEMM_DEFAULT_OFFSET_B 3072
2122-
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL
2130+
#define GEMM_DEFAULT_ALIGN LONGCAST 0x03fffUL
21232131

21242132
#if defined(__BYTE_ORDER__)&&(__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
21252133
#define SGEMM_DEFAULT_UNROLL_M 4
@@ -2168,7 +2176,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
21682176

21692177
#define GEMM_DEFAULT_OFFSET_A (32 * 0)
21702178
#define GEMM_DEFAULT_OFFSET_B (32 * 0)
2171-
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL
2179+
#define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL
21722180

21732181
#define SGEMM_DEFAULT_UNROLL_M 4
21742182
#define SGEMM_DEFAULT_UNROLL_N 4
@@ -2204,7 +2212,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
22042212

22052213
#define GEMM_DEFAULT_OFFSET_A (32 * 0)
22062214
#define GEMM_DEFAULT_OFFSET_B (32 * 0)
2207-
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL
2215+
#define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL
22082216

22092217
#define SGEMM_DEFAULT_UNROLL_M 8
22102218
#define SGEMM_DEFAULT_UNROLL_N 4
@@ -2239,7 +2247,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
22392247
#if defined(POWER3) || defined(POWER4) || defined(POWER5)
22402248
#define GEMM_DEFAULT_OFFSET_A 0
22412249
#define GEMM_DEFAULT_OFFSET_B 2048
2242-
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL
2250+
#define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL
22432251

22442252
#define SGEMM_DEFAULT_UNROLL_M 4
22452253
#define SGEMM_DEFAULT_UNROLL_N 4
@@ -2312,7 +2320,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23122320

23132321
#define GEMM_DEFAULT_OFFSET_A 384
23142322
#define GEMM_DEFAULT_OFFSET_B 1024
2315-
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL
2323+
#define GEMM_DEFAULT_ALIGN LONGCAST 0x03fffUL
23162324

23172325
#define SGEMM_DEFAULT_UNROLL_M 4
23182326
#define SGEMM_DEFAULT_UNROLL_N 4
@@ -2345,7 +2353,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23452353
#define GEMM_DEFAULT_OFFSET_A 0
23462354
#define GEMM_DEFAULT_OFFSET_B 65536
23472355

2348-
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL
2356+
#define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL
23492357
#if defined(__32BIT__)
23502358
#warning using BINARY32==POWER6
23512359
#define SGEMM_DEFAULT_UNROLL_M 4
@@ -2398,7 +2406,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23982406

23992407
#define GEMM_DEFAULT_OFFSET_A 0
24002408
#define GEMM_DEFAULT_OFFSET_B 65536
2401-
#define GEMM_DEFAULT_ALIGN 0x0ffffUL
2409+
#define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL
24022410

24032411
#define SWITCH_RATIO 16
24042412
#define GEMM_PREFERED_SIZE 16
@@ -2437,7 +2445,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24372445

24382446
#define GEMM_DEFAULT_OFFSET_A 0
24392447
#define GEMM_DEFAULT_OFFSET_B 65536
2440-
#define GEMM_DEFAULT_ALIGN 0x0ffffUL
2448+
#define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL
24412449

24422450
#define SWITCH_RATIO 16
24432451
#define GEMM_PREFERED_SIZE 16

0 commit comments

Comments
 (0)