Skip to content

Commit f6f0d13

Browse files
authored
Merge pull request #3842 from Mousius/sve-dot
Add SVE implementation for sdot/ddot
2 parents b6a4ef9 + eea006a commit f6f0d13

File tree

10 files changed

+204
-81
lines changed

10 files changed

+204
-81
lines changed

Makefile.arm64

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -70,12 +70,12 @@ endif
7070
ifeq ($(CORE), NEOVERSEN1)
7171
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
7272
ifeq ($(GCCVERSIONGTEQ9), 1)
73-
CCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1
73+
CCOMMON_OPT += -march=armv8.2-a+sve -mtune=neoverse-n1
7474
ifneq ($(F_COMPILER), NAG)
7575
FCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1
7676
endif
7777
else
78-
CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
78+
CCOMMON_OPT += -march=armv8.2-a+sve -mtune=cortex-a72
7979
ifneq ($(F_COMPILER), NAG)
8080
FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
8181
endif
@@ -94,12 +94,12 @@ ifeq ($(CORE), NEOVERSEV1)
9494
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
9595
ifeq ($(GCCVERSIONGTEQ10), 1)
9696
ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11)))
97-
CCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1
97+
CCOMMON_OPT += -march=armv8.4-a+sve -mtune=neoverse-v1
9898
ifneq ($(F_COMPILER), NAG)
9999
FCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1
100100
endif
101101
else
102-
CCOMMON_OPT += -march=armv8.4-a -mtune=native
102+
CCOMMON_OPT += -march=armv8.4-a+sve -mtune=native
103103
ifneq ($(F_COMPILER), NAG)
104104
FCOMMON_OPT += -march=armv8.4-a -mtune=native
105105
endif
@@ -133,7 +133,7 @@ ifneq ($(F_COMPILER), NAG)
133133
FCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2
134134
endif
135135
else
136-
CCOMMON_OPT += -march=armv8.5-a -mtune=native
136+
CCOMMON_OPT += -march=armv8.5-a+sve -mtune=native
137137
ifneq ($(F_COMPILER), NAG)
138138
FCOMMON_OPT += -march=armv8.5-a -mtune=native
139139
endif

getarch.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1410,7 +1410,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
14101410
"-DL2_SIZE=1048576 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
14111411
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
14121412
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8 " \
1413-
"-march=armv8.4-a -mtune=neoverse-v1"
1413+
"-march=armv8.4-a+sve -mtune=neoverse-v1"
14141414
#define LIBNAME "neoversev1"
14151415
#define CORENAME "NEOVERSEV1"
14161416
#endif

kernel/arm64/KERNEL.NEOVERSEN1

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -96,8 +96,8 @@ DNRM2KERNEL = dznrm2_thunderx2t99.c
9696
CNRM2KERNEL = scnrm2_thunderx2t99.c
9797
ZNRM2KERNEL = dznrm2_thunderx2t99.c
9898

99-
DDOTKERNEL = dot_thunderx2t99.c
100-
SDOTKERNEL = dot_thunderx2t99.c
99+
DDOTKERNEL = dot.c
100+
SDOTKERNEL = dot.c
101101
CDOTKERNEL = zdot_thunderx2t99.c
102102
ZDOTKERNEL = zdot_thunderx2t99.c
103103
DSDOTKERNEL = dot.S

kernel/arm64/KERNEL.NEOVERSEN2

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -96,8 +96,8 @@ DNRM2KERNEL = dznrm2_thunderx2t99.c
9696
CNRM2KERNEL = scnrm2_thunderx2t99.c
9797
ZNRM2KERNEL = dznrm2_thunderx2t99.c
9898

99-
DDOTKERNEL = dot_thunderx2t99.c
100-
SDOTKERNEL = dot_thunderx2t99.c
99+
DDOTKERNEL = dot.c
100+
SDOTKERNEL = dot.c
101101
CDOTKERNEL = zdot_thunderx2t99.c
102102
ZDOTKERNEL = zdot_thunderx2t99.c
103103
DSDOTKERNEL = dot.S

kernel/arm64/KERNEL.NEOVERSEV1

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -96,8 +96,8 @@ DNRM2KERNEL = dznrm2_thunderx2t99.c
9696
CNRM2KERNEL = scnrm2_thunderx2t99.c
9797
ZNRM2KERNEL = dznrm2_thunderx2t99.c
9898

99-
DDOTKERNEL = dot_thunderx2t99.c
100-
SDOTKERNEL = dot_thunderx2t99.c
99+
DDOTKERNEL = dot.c
100+
SDOTKERNEL = dot.c
101101
CDOTKERNEL = zdot_thunderx2t99.c
102102
ZDOTKERNEL = zdot_thunderx2t99.c
103103
DSDOTKERNEL = dot.S

kernel/arm64/KERNEL.THUNDERX2T99

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -161,8 +161,8 @@ DNRM2KERNEL = dznrm2_thunderx2t99.c
161161
ZNRM2KERNEL = dznrm2_thunderx2t99.c
162162

163163

164-
DDOTKERNEL = dot_thunderx2t99.c
165-
SDOTKERNEL = dot_thunderx2t99.c
164+
DDOTKERNEL = dot.c
165+
SDOTKERNEL = dot.c
166166
CDOTKERNEL = zdot_thunderx2t99.c
167167
ZDOTKERNEL = zdot_thunderx2t99.c
168168
DSDOTKERNEL = dot.S

kernel/arm64/KERNEL.THUNDERX3T110

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -161,8 +161,8 @@ DNRM2KERNEL = dznrm2_thunderx2t99.c
161161
ZNRM2KERNEL = dznrm2_thunderx2t99.c
162162

163163

164-
DDOTKERNEL = dot_thunderx2t99.c
165-
SDOTKERNEL = dot_thunderx2t99.c
164+
DDOTKERNEL = dot.c
165+
SDOTKERNEL = dot.c
166166
CDOTKERNEL = zdot_thunderx2t99.c
167167
ZDOTKERNEL = zdot_thunderx2t99.c
168168
DSDOTKERNEL = dot.S

kernel/arm64/dot.c

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
/***************************************************************************
2+
Copyright (c) 2017, The OpenBLAS Project
3+
Copyright (c) 2022, Arm Ltd
4+
All rights reserved.
5+
Redistribution and use in source and binary forms, with or without
6+
modification, are permitted provided that the following conditions are
7+
met:
8+
1. Redistributions of source code must retain the above copyright
9+
notice, this list of conditions and the following disclaimer.
10+
2. Redistributions in binary form must reproduce the above copyright
11+
notice, this list of conditions and the following disclaimer in
12+
the documentation and/or other materials provided with the
13+
distribution.
14+
3. Neither the name of the OpenBLAS project nor the names of
15+
its contributors may be used to endorse or promote products
16+
derived from this software without specific prior written permission.
17+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20+
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
21+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
23+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
24+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
25+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
26+
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27+
*****************************************************************************/
28+
29+
30+
#include "common.h"
31+
32+
// Some compilers will report feature support for SVE without the appropriate
33+
// header available
34+
#ifdef HAVE_SVE
35+
#if defined __has_include
36+
#if __has_include(<arm_sve.h>) && __ARM_FEATURE_SVE
37+
#define USE_SVE
38+
#endif
39+
#endif
40+
#endif
41+
42+
#ifdef USE_SVE
43+
#include "dot_kernel_sve.c"
44+
#endif
45+
#include "dot_kernel_asimd.c"
46+
47+
#if defined(SMP)
48+
extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n,
49+
BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb,
50+
void *c, BLASLONG ldc, int (*function)(), int nthreads);
51+
#endif
52+
53+
static RETURN_TYPE dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
54+
{
55+
RETURN_TYPE dot = 0.0 ;
56+
57+
if ( n <= 0 ) return dot;
58+
59+
#ifdef USE_SVE
60+
if (inc_x == 1 && inc_y == 1) {
61+
return dot_kernel_sve(n, x, y);
62+
}
63+
#endif
64+
65+
return dot_kernel_asimd(n, x, inc_x, y, inc_y);
66+
}
67+
68+
#if defined(SMP)
69+
static int dot_thread_function(BLASLONG n, BLASLONG dummy0,
70+
BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y,
71+
BLASLONG inc_y, FLOAT *result, BLASLONG dummy3)
72+
{
73+
*(RETURN_TYPE *)result = dot_compute(n, x, inc_x, y, inc_y);
74+
75+
return 0;
76+
}
77+
#endif
78+
79+
RETURN_TYPE CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
80+
{
81+
#if defined(SMP)
82+
int nthreads;
83+
FLOAT dummy_alpha;
84+
#endif
85+
RETURN_TYPE dot = 0.0;
86+
87+
#if defined(SMP)
88+
if (inc_x == 0 || inc_y == 0 || n <= 10000)
89+
nthreads = 1;
90+
else
91+
nthreads = num_cpu_avail(1);
92+
93+
if (nthreads == 1) {
94+
dot = dot_compute(n, x, inc_x, y, inc_y);
95+
} else {
96+
int mode, i;
97+
char result[MAX_CPU_NUMBER * sizeof(double) * 2];
98+
RETURN_TYPE *ptr;
99+
100+
#if !defined(DOUBLE)
101+
mode = BLAS_SINGLE | BLAS_REAL;
102+
#else
103+
mode = BLAS_DOUBLE | BLAS_REAL;
104+
#endif
105+
106+
blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha,
107+
x, inc_x, y, inc_y, result, 0,
108+
( void *)dot_thread_function, nthreads);
109+
110+
ptr = (RETURN_TYPE *)result;
111+
for (i = 0; i < nthreads; i++) {
112+
dot = dot + (*ptr);
113+
ptr = (RETURN_TYPE *)(((char *)ptr) + sizeof(double) * 2);
114+
}
115+
}
116+
#else
117+
dot = dot_compute(n, x, inc_x, y, inc_y);
118+
#endif
119+
120+
return dot;
121+
}

kernel/arm64/dot_thunderx2t99.c renamed to kernel/arm64/dot_kernel_asimd.c

Lines changed: 1 addition & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -260,18 +260,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
260260
" faddp "OUT", v0.2d \n"
261261
#endif /* !defined(DOUBLE) */
262262

263-
#if defined(SMP)
264-
extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n,
265-
BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb,
266-
void *c, BLASLONG ldc, int (*function)(), int nthreads);
267-
#endif
268-
269-
static RETURN_TYPE dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
263+
static RETURN_TYPE dot_kernel_asimd(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
270264
{
271265
RETURN_TYPE dot = 0.0;
272-
273-
if ( n < 0 ) return dot;
274-
275266
BLASLONG j = 0;
276267

277268
__asm__ __volatile__ (
@@ -352,58 +343,3 @@ static RETURN_TYPE dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, B
352343

353344
return dot;
354345
}
355-
356-
#if defined(SMP)
357-
static int dot_thread_function(BLASLONG n, BLASLONG dummy0,
358-
BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y,
359-
BLASLONG inc_y, FLOAT *result, BLASLONG dummy3)
360-
{
361-
*(RETURN_TYPE *)result = dot_compute(n, x, inc_x, y, inc_y);
362-
363-
return 0;
364-
}
365-
#endif
366-
367-
RETURN_TYPE CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
368-
{
369-
#if defined(SMP)
370-
int nthreads;
371-
FLOAT dummy_alpha;
372-
#endif
373-
RETURN_TYPE dot = 0.0;
374-
375-
#if defined(SMP)
376-
if (inc_x == 0 || inc_y == 0 || n <= 10000)
377-
nthreads = 1;
378-
else
379-
nthreads = num_cpu_avail(1);
380-
381-
if (nthreads == 1) {
382-
dot = dot_compute(n, x, inc_x, y, inc_y);
383-
} else {
384-
int mode, i;
385-
char result[MAX_CPU_NUMBER * sizeof(double) * 2];
386-
RETURN_TYPE *ptr;
387-
388-
#if !defined(DOUBLE)
389-
mode = BLAS_SINGLE | BLAS_REAL;
390-
#else
391-
mode = BLAS_DOUBLE | BLAS_REAL;
392-
#endif
393-
394-
blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha,
395-
x, inc_x, y, inc_y, result, 0,
396-
( void *)dot_thread_function, nthreads);
397-
398-
ptr = (RETURN_TYPE *)result;
399-
for (i = 0; i < nthreads; i++) {
400-
dot = dot + (*ptr);
401-
ptr = (RETURN_TYPE *)(((char *)ptr) + sizeof(double) * 2);
402-
}
403-
}
404-
#else
405-
dot = dot_compute(n, x, inc_x, y, inc_y);
406-
#endif
407-
408-
return dot;
409-
}

kernel/arm64/dot_kernel_sve.c

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
/***************************************************************************
2+
Copyright (c) 2022, Arm Ltd
3+
All rights reserved.
4+
Redistribution and use in source and binary forms, with or without
5+
modification, are permitted provided that the following conditions are
6+
met:
7+
1. Redistributions of source code must retain the above copyright
8+
notice, this list of conditions and the following disclaimer.
9+
2. Redistributions in binary form must reproduce the above copyright
10+
notice, this list of conditions and the following disclaimer in
11+
the documentation and/or other materials provided with the
12+
distribution.
13+
3. Neither the name of the OpenBLAS project nor the names of
14+
its contributors may be used to endorse or promote products
15+
derived from this software without specific prior written permission.
16+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19+
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21+
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
22+
GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23+
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24+
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
25+
THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+
*****************************************************************************/
27+
28+
#include "common.h"
29+
30+
#include <arm_sve.h>
31+
32+
#ifdef DOUBLE
33+
#define SVE_TYPE svfloat64_t
34+
#define SVE_ZERO svdup_f64(0.0)
35+
#define SVE_WHILELT svwhilelt_b64
36+
#define SVE_ALL svptrue_b64()
37+
#define SVE_WIDTH svcntd()
38+
#else
39+
#define SVE_TYPE svfloat32_t
40+
#define SVE_ZERO svdup_f32(0.0)
41+
#define SVE_WHILELT svwhilelt_b32
42+
#define SVE_ALL svptrue_b32()
43+
#define SVE_WIDTH svcntw()
44+
#endif
45+
46+
static FLOAT dot_kernel_sve(BLASLONG n, FLOAT *x, FLOAT *y) {
47+
SVE_TYPE acc_a = SVE_ZERO;
48+
SVE_TYPE acc_b = SVE_ZERO;
49+
50+
BLASLONG sve_width = SVE_WIDTH;
51+
52+
for (BLASLONG i = 0; i < n; i += sve_width * 2) {
53+
svbool_t pg_a = SVE_WHILELT(i, n);
54+
svbool_t pg_b = SVE_WHILELT(i + sve_width, n);
55+
56+
SVE_TYPE x_vec_a = svld1(pg_a, &x[i]);
57+
SVE_TYPE y_vec_a = svld1(pg_a, &y[i]);
58+
SVE_TYPE x_vec_b = svld1(pg_b, &x[i + sve_width]);
59+
SVE_TYPE y_vec_b = svld1(pg_b, &y[i + sve_width]);
60+
61+
acc_a = svmla_m(pg_a, acc_a, x_vec_a, y_vec_a);
62+
acc_b = svmla_m(pg_b, acc_b, x_vec_b, y_vec_b);
63+
}
64+
65+
return svaddv(SVE_ALL, acc_a) + svaddv(SVE_ALL, acc_b);
66+
}

0 commit comments

Comments
 (0)