Merge pull request #7 from xianyi/develop

martin-frbg · web-flow · commit 018dec858852 · 2021-01-10T17:09:46.000+01:00
rebase
diff --git a/Makefile.system b/Makefile.system
@@ -21,6 +21,8 @@ ifeq ($(ARCH), amd64)
 override ARCH=x86_64
 else ifeq ($(ARCH), powerpc64)
 override ARCH=power
+else ifeq ($(ARCH), powerpc64le)
+override ARCH=power
 else ifeq ($(ARCH), powerpc)
 override ARCH=power
 else ifeq ($(ARCH), i386)
diff --git a/README.md b/README.md
@@ -13,10 +13,14 @@ Drone CI: [![Build Status](https://cloud.drone.io/api/badges/xianyi/OpenBLAS/sta
 
 ## Introduction
 
-OpenBLAS is an optimized Basic Linear Algebra Subprograms library based on GotoBLAS2 1.13 BSD version.
+OpenBLAS is an optimized BLAS (Basic Linear Algebra Subprograms) library based on GotoBLAS2 1.13 BSD version.
 
 Please read the documentation on the OpenBLAS wiki pages: <https://github.com/xianyi/OpenBLAS/wiki>.
 
+For a general introduction to the BLAS routines, please refer to the extensive documentation of their reference implementation hosted at netlib:
+<https://www.netlib.org/blas>. On that site you will likewise find documentation for the reference implementation of the higher-level library LAPACK - the **L**inear **A**lgebra **Pack**age that comes included with OpenBLAS. If you are looking for a general primer or refresher on Linear Algebra, the set of six
+20-minute lecture videos by Prof. Gilbert Strang on either MIT OpenCourseWare <https://ocw.mit.edu/resources/res-18-010-a-2020-vision-of-linear-algebra-spring-2020/> or YouTube <https://www.youtube.com/playlist?list=PLUl4u3cNGP61iQEFiWLE21EJCxwmWvvek> may be helpful.
+
 ## Binary Packages
 
 We provide official binary packages for the following platform:
diff --git a/getarch.c b/getarch.c
@@ -1375,6 +1375,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #ifdef __riscv
 #include "cpuid_riscv64.c"
+#define OPENBLAS_SUPPORTED
 #endif
 
 #ifdef __arm__
diff --git a/kernel/arm64/KERNEL.NEOVERSEN1 b/kernel/arm64/KERNEL.NEOVERSEN1
@@ -91,10 +91,10 @@ IDAMAXKERNEL   = iamax_thunderx2t99.c
 ICAMAXKERNEL   = izamax_thunderx2t99.c
 IZAMAXKERNEL   = izamax_thunderx2t99.c
 
-SNRM2KERNEL    = nrm2.S
-DNRM2KERNEL    = nrm2.S
-CNRM2KERNEL    = znrm2.S
-ZNRM2KERNEL    = znrm2.S
+SNRM2KERNEL    = scnrm2_thunderx2t99.c
+DNRM2KERNEL    = dznrm2_thunderx2t99.c
+CNRM2KERNEL    = scnrm2_thunderx2t99.c
+ZNRM2KERNEL    = dznrm2_thunderx2t99.c
 
 DDOTKERNEL     = dot_thunderx2t99.c
 SDOTKERNEL     = dot_thunderx2t99.c
diff --git a/kernel/arm64/KERNEL.THUNDERX2T99 b/kernel/arm64/KERNEL.THUNDERX2T99
@@ -153,12 +153,12 @@ IDAMAXKERNEL   = iamax_thunderx2t99.c
 ICAMAXKERNEL   = izamax_thunderx2t99.c
 IZAMAXKERNEL   = izamax_thunderx2t99.c
 
-SNRM2KERNEL    = nrm2.S
-CNRM2KERNEL    = nrm2.S
+SNRM2KERNEL    = scnrm2_thunderx2t99.c
+CNRM2KERNEL    = scnrm2_thunderx2t99.c
 #DNRM2KERNEL    = dznrm2_thunderx2t99_fast.c
 #ZNRM2KERNEL    = dznrm2_thunderx2t99_fast.c
-DNRM2KERNEL    = znrm2.S
-ZNRM2KERNEL    = znrm2.S
+DNRM2KERNEL    = dznrm2_thunderx2t99.c
+ZNRM2KERNEL    = dznrm2_thunderx2t99.c
 
 
 DDOTKERNEL     = dot_thunderx2t99.c
diff --git a/kernel/arm64/KERNEL.THUNDERX3T110 b/kernel/arm64/KERNEL.THUNDERX3T110
@@ -153,16 +153,13 @@ IDAMAXKERNEL   = iamax_thunderx2t99.c
 ICAMAXKERNEL   = izamax_thunderx2t99.c
 IZAMAXKERNEL   = izamax_thunderx2t99.c
 
-#SNRM2KERNEL    = scnrm2_thunderx2t99.c
-#CNRM2KERNEL    = scnrm2_thunderx2t99.c
-##DNRM2KERNEL    = dznrm2_thunderx2t99_fast.c
-##ZNRM2KERNEL    = dznrm2_thunderx2t99_fast.c
-#DNRM2KERNEL    = dznrm2_thunderx2t99.c
-#ZNRM2KERNEL    = dznrm2_thunderx2t99.c
-SNRM2KERNEL = nrm2.S
-DNRM2KERNEL = nrm2.S
-CNRM2KERNEL = znrm2.S
-ZNRM2KERNEL = znrm2.S
+SNRM2KERNEL    = scnrm2_thunderx2t99.c
+CNRM2KERNEL    = scnrm2_thunderx2t99.c
+#DNRM2KERNEL    = dznrm2_thunderx2t99_fast.c
+#ZNRM2KERNEL    = dznrm2_thunderx2t99_fast.c
+DNRM2KERNEL    = dznrm2_thunderx2t99.c
+ZNRM2KERNEL    = dznrm2_thunderx2t99.c
+
 
 DDOTKERNEL     = dot_thunderx2t99.c
 SDOTKERNEL     = dot_thunderx2t99.c
diff --git a/kernel/arm64/dznrm2_thunderx2t99.c b/kernel/arm64/dznrm2_thunderx2t99.c
@@ -58,6 +58,7 @@ extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n
 #define CUR_MAXINV	"d8"
 #define CUR_MAXINV_V	"v8.2d"
 #define CUR_MAX_V	"v8.2d"
+#define REGINF		"d9"
 
 static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x,
 		         double *ssq, double *scale)
@@ -79,8 +80,10 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x,
 	"	ble	9f //nrm2_kernel_L999			\n"
 
 	"1: //nrm2_kernel_F_BEGIN:				\n"
+	"	mov	x6, #0x7FF0000000000000 //+Infinity	\n"
 	"	fmov	"REGZERO", xzr				\n"
 	"	fmov	"REGONE", #1.0				\n"
+	"	fmov	"REGINF", x6				\n"
 	"	lsl	"INC_X", "INC_X", #"INC_SHIFT"		\n"
 	"	mov	"J", "N"				\n"
 	"	cmp	"J", xzr				\n"
@@ -104,6 +107,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x,
 	"	ldr	d4, ["X"]				\n"
 	"	fabs	d4, d4					\n"
 	"	fmax	"CUR_MAX", "SCALE", d4			\n"
+	"	fcmp	"CUR_MAX", "REGINF"			\n"
+	"	beq	10f					\n"
 	"	fdiv	"SCALE", "SCALE", "CUR_MAX"		\n"
 	"	fmul	"SCALE", "SCALE", "SCALE"		\n"
 	"	fmul	"SSQ", "SSQ", "SCALE"			\n"
@@ -116,6 +121,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x,
 	"	ldr	d3, ["X", #8]				\n"
 	"	fabs	d3, d3					\n"
 	"	fmax	"CUR_MAX", "SCALE", d3			\n"
+	"	fcmp	"CUR_MAX", "REGINF"			\n"
+	"	beq	10f					\n"
 	"	fdiv	"SCALE", "SCALE", "CUR_MAX"		\n"
 	"	fmul	"SCALE", "SCALE", "SCALE"		\n"
 	"	fmul	"SSQ", "SSQ", "SCALE"			\n"
@@ -158,6 +165,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x,
 	"	fmaxp	v24.2d, v24.2d, v26.2d			\n"
 	"	fmaxp	v24.2d, v24.2d, v24.2d			\n"
 	"	fmax	"CUR_MAX", "SCALE", d24			\n"
+	"	fcmp	"CUR_MAX", "REGINF"			\n"
+	"	beq	10f					\n"
 	"	fdiv	"CUR_MAXINV", "REGONE", "CUR_MAX"	\n"
 	"	//dup	"CUR_MAX_V", v7.d[0]			\n"
 	"	fdiv	"SCALE", "SCALE", "CUR_MAX"		\n"
@@ -217,6 +226,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x,
 	"	fmaxp	v24.2d, v24.2d, v26.2d			\n"
 	"	fmaxp	v24.2d, v24.2d, v24.2d			\n"
 	"	fmax	"CUR_MAX", "SCALE", d24			\n"
+	"	fcmp	"CUR_MAX", "REGINF"			\n"
+	"	beq	10f					\n"
 	"	fdiv	"CUR_MAXINV", "REGONE", "CUR_MAX"	\n"
 	"	//dup	"CUR_MAX_V", v7.d[0]			\n"
 	"	fdiv	"SCALE", "SCALE", "CUR_MAX"		\n"
@@ -265,6 +276,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x,
 	"	ldr	d4, ["X"]				\n"
 	"	fabs	d4, d4					\n"
 	"	fmax	"CUR_MAX", "SCALE", d4			\n"
+	"	fcmp	"CUR_MAX", "REGINF"			\n"
+	"	beq	10f					\n"
 	"	fdiv	"SCALE", "SCALE", "CUR_MAX"		\n"
 	"	fmul	"SCALE", "SCALE", "SCALE"		\n"
 	"	fmul	"SSQ", "SSQ", "SCALE"			\n"
@@ -276,6 +289,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x,
 	"	ldr	d3, ["X", #8]				\n"
 	"	fabs	d3, d3					\n"
 	"	fmax	"CUR_MAX", "SCALE", d3			\n"
+	"	fcmp	"CUR_MAX", "REGINF"			\n"
+	"	beq	10f					\n"
 	"	fdiv	"SCALE", "SCALE", "CUR_MAX"		\n"
 	"	fmul	"SCALE", "SCALE", "SCALE"		\n"
 	"	fmul	"SSQ", "SSQ", "SCALE"			\n"
@@ -291,6 +306,11 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x,
 	"9: //nrm2_kernel_L999:					\n"
 	"	str	"SSQ", [%[SSQ_]]			\n"
 	"	str	"SCALE", [%[SCALE_]]			\n"
+	"	b	11f					\n"
+	"10:							\n"
+	"	str	"REGINF", [%[SSQ_]]			\n"
+	"	str	"REGINF", [%[SCALE_]]			\n"
+	"11:							\n"
 
 	:
 	: [SSQ_]    "r"  (ssq),			//%0
@@ -300,7 +320,7 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x,
 	  [INCX_]   "r"  (inc_x)		//%4
 	: "cc",
 	  "memory",
-	  "x0", "x1", "x2", "x3", "x4", "x5",
+	  "x0", "x1", "x2", "x3", "x4", "x5", "x6",	// x6 is loaded with the +Infinity bit pattern; NOTE(review): REGINF ("d9") is also written by this asm but the d-register list ends at "d8" - confirm
 	  "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8"
 	);
 
@@ -359,6 +379,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 			cur_ssq = *ptr;
 			cur_scale = *(ptr + 1);
 
+			if (cur_ssq == INFINITY) {
+				ssq = INFINITY;
+				scale = INFINITY;
+				break;
+			}
+
 			if (cur_scale != 0) {
 				if (cur_scale > scale) {
 					scale = (scale / cur_scale);
diff --git a/kernel/power/cswap.c b/kernel/power/cswap.c
@@ -36,9 +36,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 
-#if defined(POWER8)  || defined(POWER9) || defined(POWER10)
 #if defined(__VEC__) || defined(__ALTIVEC__)
+#if defined(POWER8)  || defined(POWER9)
 #include "cswap_microk_power8.c"
+#elif defined(POWER10)
+#include "cswap_microk_power10.c"
 #endif
 #endif
 
diff --git a/kernel/power/cswap_microk_power10.c b/kernel/power/cswap_microk_power10.c
@@ -0,0 +1,127 @@
+/***************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#if defined(DOUBLE)	/* DOUBLE build: kernel takes double pointers and counts 16 elements per loop pass */
+#define HAVE_KERNEL_16 1
+static void zswap_kernel_16 (long n, double *x, double *y)
+#else	/* otherwise: kernel takes float pointers and counts 32 elements per loop pass */
+#define HAVE_KERNEL_32 1
+static void cswap_kernel_32 (long n, float *x, float *y)
+#endif
+{	/* Exchanges the contents of x and y in 256-byte chunks; the body runs once before n is tested, so at least one chunk is always swapped */
+  __asm__
+    (
+       ".align	5		\n"
+     "one%=:				\n\t"	/* loop head; %= makes the label unique per asm instance */
+       "lxvp            32, 0(%4)       \n\t"	/* vs32..vs47 <- 256 bytes at y (%4), one register pair per lxvp */
+       "lxvp            34, 32(%4)      \n\t"
+       "lxvp            36, 64(%4)      \n\t"
+       "lxvp            38, 96(%4)      \n\t"
+
+       "lxvp            40, 128(%4)     \n\t"
+       "lxvp            42, 160(%4)     \n\t"
+       "lxvp            44, 192(%4)     \n\t"
+       "lxvp            46, 224(%4)     \n\t"
+
+       "lxvp            48, 0(%3)       \n\t"	/* vs48..vs63 <- 256 bytes at x (%3) */
+       "lxvp            50, 32(%3)      \n\t"
+       "lxvp            52, 64(%3)      \n\t"
+       "lxvp            54, 96(%3)      \n\t"
+
+       "lxvp            56, 128(%3)     \n\t"
+       "lxvp            58, 160(%3)     \n\t"
+       "lxvp            60, 192(%3)     \n\t"
+       "lxvp            62, 224(%3)     \n\t"
+
+
+       "stxv		33, 0(%3)	\n\t"	/* write the former y data to x; each pair is stored odd register first (presumably to match the lxvp pair layout - confirm) */
+       "stxv		32, 16(%3)	\n\t"
+       "stxv		35, 32(%3)	\n\t"
+       "stxv		34, 48(%3)	\n\t"
+       "stxv		37, 64(%3)	\n\t"
+       "stxv		36, 80(%3)	\n\t"
+       "stxv		39, 96(%3)	\n\t"
+       "stxv		38, 112(%3)	\n\t"
+
+       "addi		%3, %3, 128	\n\t"	/* x += 128 bytes (first half of the chunk written) */
+
+       "stxv		41, 0(%3)	\n\t"
+       "stxv		40, 16(%3)	\n\t"
+       "stxv		43, 32(%3)	\n\t"
+       "stxv		42, 48(%3)	\n\t"
+       "stxv		45, 64(%3)	\n\t"
+       "stxv		44, 80(%3)	\n\t"
+       "stxv		47, 96(%3)	\n\t"
+       "stxv		46, 112(%3)	\n\t"
+
+       "addi		%3, %3, 128	\n\t"	/* x now points at the next 256-byte chunk */
+
+       "stxv		49, 0(%4)	\n\t"	/* write the former x data to y, same register order as above */
+       "stxv		48, 16(%4)	\n\t"
+       "stxv		51, 32(%4)	\n\t"
+       "stxv		50, 48(%4)	\n\t"
+       "stxv		53, 64(%4)	\n\t"
+       "stxv		52, 80(%4)	\n\t"
+       "stxv		55, 96(%4)	\n\t"
+       "stxv		54, 112(%4)	\n\t"
+
+       "addi		%4, %4, 128	\n\t"	/* y += 128 bytes */
+
+       "stxv		57, 0(%4)	\n\t"
+       "stxv		56, 16(%4)	\n\t"
+       "stxv		59, 32(%4)	\n\t"
+       "stxv		58, 48(%4)	\n\t"
+       "stxv		61, 64(%4)	\n\t"
+       "stxv		60, 80(%4)	\n\t"
+       "stxv		63, 96(%4)	\n\t"
+       "stxv		62, 112(%4)	\n\t"
+
+       "addi		%4, %4, 128	\n\t"	/* y now points at the next 256-byte chunk */
+
+#if defined(DOUBLE)	/* the per-pass element count differs by build, the 256 bytes moved do not */
+       "addic.		%2, %2, -16	\n\t"	/* n -= 16, i.e. 16 bytes per counted element; record form updates cr0 */
+#else
+       "addic.		%2, %2, -32	\n\t"	/* n -= 32, i.e. 8 bytes per counted element; record form updates cr0 */
+#endif
+       "bgt		one%=		\n"	/* loop again while the decremented n is still > 0 */
+
+     "#n=%2 x=%0=%3 y=%1=%4"	/* trailing text documents the operand numbering */
+     :
+       "+m" (*x),	// 0: NOTE(review): only *x and *y are declared as memory operands and there is no "memory" clobber, yet 256 bytes per pass are accessed through %3/%4 - confirm this is sufficient
+       "+m" (*y),	// 1
+       "+r" (n),	// 2: remaining element count, decremented in place
+       "+b" (x),	// 3: running x pointer, advanced by 256 bytes per pass
+       "+b" (y)		// 4: running y pointer, advanced by 256 bytes per pass
+     :
+     :
+       "cr0",	/* written by addic. */
+       "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",	/* every VSX register used by the lxvp/stxv sequence */
+       "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
+       "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55",
+       "vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63"
+     );
+}
diff --git a/kernel/power/dswap.c b/kernel/power/dswap.c
@@ -35,9 +35,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 
-#if defined(POWER8) || defined(POWER9) || defined(POWER10)
 #if defined(__VEC__) || defined(__ALTIVEC__)
+#if defined(POWER8) || defined(POWER9)
 #include "dswap_microk_power8.c"
+#elif defined(POWER10)
+#include "swap_microk_power10.c"
 #endif
 #endif
 
@@ -115,12 +117,30 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
 	if ( (inc_x == 1) && (inc_y == 1 ))
 	{
 
+#if defined(POWER10)
+		if ( n >= 32 )
+		{
+			BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3;
+			for (i = 0; i < align; i++) {
+				temp = y[i];
+				y[i] = x[i];
+				x[i] = temp;
+			}
+		}
+		BLASLONG n1 = (n-i) & -32;
+		if ( n1 > 0 )
+		{
+			dswap_kernel_32(n1,&x[i], &y[i]);
+			i+=n1;
+		}
+#else
 		BLASLONG n1 = n & -32;
 		if ( n1 > 0 )
 		{
 			dswap_kernel_32(n1, x, y);
 			i=n1;
 		}
+#endif
 
 		while(i < n)
 		{
diff --git a/kernel/power/sswap.c b/kernel/power/sswap.c
diff --git a/kernel/power/swap_microk_power10.c b/kernel/power/swap_microk_power10.c
diff --git a/kernel/power/zswap.c b/kernel/power/zswap.c