Skip to content

Commit b1f6c4a

Browse files
authored
Merge pull request #4160 from Mousius/sve-sniff
Add ARMV8SVE to AArch64 Dynamic Dispatch
2 parents 7976def + 24586bc commit b1f6c4a

25 files changed

+160
-84
lines changed

Makefile.system

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -668,6 +668,7 @@ DYNAMIC_CORE += NEOVERSEN1
668668
ifneq ($(NO_SVE), 1)
669669
DYNAMIC_CORE += NEOVERSEV1
670670
DYNAMIC_CORE += NEOVERSEN2
671+
DYNAMIC_CORE += ARMV8SVE
671672
endif
672673
DYNAMIC_CORE += CORTEXA55
673674
DYNAMIC_CORE += FALKOR

cmake/arch.cmake

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -46,7 +46,7 @@ if (DYNAMIC_ARCH)
4646
if (ARM64)
4747
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)
4848
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 9.99)
49-
set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2)
49+
set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE)
5050
endif ()
5151
if (DYNAMIC_LIST)
5252
set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST})

driver/others/dynamic_arm64.c

Lines changed: 18 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
/*********************************************************************/
22
/* Copyright 2009, 2010 The University of Texas at Austin. */
3+
/* Copyright 2023 The OpenBLAS Project */
34
/* All rights reserved. */
45
/* */
56
/* Redistribution and use in source and binary forms, with or */
@@ -109,6 +110,11 @@ extern gotoblas_t gotoblas_NEOVERSEN2;
109110
#else
110111
#define gotoblas_NEOVERSEN2 gotoblas_ARMV8
111112
#endif
113+
#ifdef DYN_ARMV8SVE
114+
extern gotoblas_t gotoblas_ARMV8SVE;
115+
#else
116+
#define gotoblas_ARMV8SVE gotoblas_ARMV8
117+
#endif
112118
#ifdef DYN_CORTEX_A55
113119
extern gotoblas_t gotoblas_CORTEXA55;
114120
#else
@@ -128,9 +134,11 @@ extern gotoblas_t gotoblas_NEOVERSEN1;
128134
#ifndef NO_SVE
129135
extern gotoblas_t gotoblas_NEOVERSEV1;
130136
extern gotoblas_t gotoblas_NEOVERSEN2;
137+
extern gotoblas_t gotoblas_ARMV8SVE;
131138
#else
132139
#define gotoblas_NEOVERSEV1 gotoblas_ARMV8
133140
#define gotoblas_NEOVERSEN2 gotoblas_ARMV8
141+
#define gotoblas_ARMV8SVE gotoblas_ARMV8
134142
#endif
135143
extern gotoblas_t gotoblas_THUNDERX3T110;
136144
extern gotoblas_t gotoblas_CORTEXA55;
@@ -140,7 +148,7 @@ extern void openblas_warning(int verbose, const char * msg);
140148
#define FALLBACK_VERBOSE 1
141149
#define NEOVERSEN1_FALLBACK "OpenBLAS : Your OS does not support SVE instructions. OpenBLAS is using Neoverse N1 kernels as a fallback, which may give poorer performance.\n"
142150

143-
#define NUM_CORETYPES 13
151+
#define NUM_CORETYPES 16
144152

145153
/*
146154
* In case asm/hwcap.h is outdated on the build system, make sure
@@ -173,6 +181,7 @@ static char *corename[] = {
173181
"neoversen2",
174182
"thunderx3t110",
175183
"cortexa55",
184+
"armv8sve",
176185
"unknown"
177186
};
178187

@@ -192,6 +201,7 @@ char *gotoblas_corename(void) {
192201
if (gotoblas == &gotoblas_NEOVERSEN2) return corename[12];
193202
if (gotoblas == &gotoblas_THUNDERX3T110) return corename[13];
194203
if (gotoblas == &gotoblas_CORTEXA55) return corename[14];
204+
if (gotoblas == &gotoblas_ARMV8SVE) return corename[15];
195205
return corename[NUM_CORETYPES];
196206
}
197207

@@ -226,6 +236,7 @@ static gotoblas_t *force_coretype(char *coretype) {
226236
case 12: return (&gotoblas_NEOVERSEN2);
227237
case 13: return (&gotoblas_THUNDERX3T110);
228238
case 14: return (&gotoblas_CORTEXA55);
239+
case 15: return (&gotoblas_ARMV8SVE);
229240
}
230241
snprintf(message, 128, "Core not found: %s\n", coretype);
231242
openblas_warning(1, message);
@@ -345,6 +356,12 @@ static gotoblas_t *get_coretype(void) {
345356
snprintf(coremsg, 128, "Unknown CPU model - implementer %x part %x\n",implementer,part);
346357
openblas_warning(1, coremsg);
347358
}
359+
#ifndef NO_SVE
360+
if ((getauxval(AT_HWCAP) & HWCAP_SVE)) {
361+
return &gotoblas_ARMV8SVE;
362+
}
363+
#endif
364+
348365
return NULL;
349366
#endif
350367
}

kernel/arm64/cgemm_ncopy_sve_v1.c

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
/*********************************************************************/
22
/* Copyright 2009, 2010 The University of Texas at Austin. */
3+
/* Copyright 2023 The OpenBLAS Project */
34
/* All rights reserved. */
45
/* */
56
/* Redistribution and use in source and binary forms, with or */
@@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
5253
boffset = b;
5354

5455
j = 0;
55-
svbool_t pg = svwhilelt_b32(j, n);
56+
svbool_t pg = svwhilelt_b32((uint64_t)j, (uint64_t)n);
5657
uint32_t active = svcntp_b32(svptrue_b32(), pg);
5758
do {
5859

@@ -69,7 +70,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
6970
aoffset += active * lda * 2;
7071

7172
j += svcntw();
72-
pg = svwhilelt_b32(j, n);
73+
pg = svwhilelt_b32((uint64_t)j, (uint64_t)n);
7374
active = svcntp_b32(svptrue_b32(), pg);
7475

7576

kernel/arm64/cgemm_tcopy_sve_v1.c

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
/*********************************************************************/
22
/* Copyright 2009, 2010 The University of Texas at Austin. */
3+
/* Copyright 2023 The OpenBLAS Project */
34
/* All rights reserved. */
45
/* */
56
/* Redistribution and use in source and binary forms, with or */
@@ -50,7 +51,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
5051
boffset = b;
5152

5253
j = 0;
53-
svbool_t pg = svwhilelt_b32(j, n);
54+
svbool_t pg = svwhilelt_b32((uint64_t)j, (uint64_t)n);
5455
uint32_t active = svcntp_b32(svptrue_b32(), pg);
5556
do {
5657

@@ -66,7 +67,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
6667
aoffset += active * 2;
6768

6869
j += svcntw();
69-
pg = svwhilelt_b32(j, n);
70+
pg = svwhilelt_b32((uint64_t)j, (uint64_t)n);
7071
active = svcntp_b32(svptrue_b32(), pg);
7172

7273
} while (svptest_any(svptrue_b32(), pg));

kernel/arm64/symm_lcopy_sve.c

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
/*********************************************************************/
22
/* Copyright 2009, 2010 The University of Texas at Austin. */
3+
/* Copyright 2023 The OpenBLAS Project */
34
/* All rights reserved. */
45
/* */
56
/* Redistribution and use in source and binary forms, with or */
@@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
5253
svint64_t one_vec = svdup_s64(1LL);
5354

5455
int64_t j = 0;
55-
svbool_t pg = svwhilelt_b64(j, n);
56+
svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
5657
int64_t active = svcntp_b64(svptrue_b64(), pg);
5758
svint64_t index_neg = svindex_s64(0LL, -1LL);
5859
svint64_t index = svindex_s64(0LL, 1LL);
@@ -86,7 +87,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
8687
posX += sve_size;
8788
posX_vec = svdup_s64(posX);
8889
j += sve_size;
89-
pg = svwhilelt_b64(j, n);
90+
pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
9091
active = svcntp_b64(svptrue_b64(), pg);
9192
} while (svptest_any(svptrue_b64(), pg));
9293

@@ -99,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
99100

100101
int32_t N = n;
101102
int32_t j = 0;
102-
svbool_t pg = svwhilelt_b32(j, N);
103+
svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
103104
int32_t active = svcntp_b32(svptrue_b32(), pg);
104105
svint32_t index_neg = svindex_s32(0, -1);
105106
svint32_t index = svindex_s32(0, 1);
@@ -133,7 +134,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
133134
posX += sve_size;
134135
posX_vec = svdup_s32(posX);
135136
j += sve_size;
136-
pg = svwhilelt_b32(j, N);
137+
pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
137138
active = svcntp_b32(svptrue_b32(), pg);
138139
} while (svptest_any(svptrue_b32(), pg));
139140

kernel/arm64/symm_ucopy_sve.c

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
/*********************************************************************/
22
/* Copyright 2009, 2010 The University of Texas at Austin. */
3+
/* Copyright 2023 The OpenBLAS Project */
34
/* All rights reserved. */
45
/* */
56
/* Redistribution and use in source and binary forms, with or */
@@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
5253
svint64_t one_vec = svdup_s64(1LL);
5354

5455
int64_t j = 0;
55-
svbool_t pg = svwhilelt_b64(j, n);
56+
svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
5657
int64_t active = svcntp_b64(svptrue_b64(), pg);
5758
svint64_t index_neg = svindex_s64(0LL, -1LL);
5859
svint64_t index = svindex_s64(0LL, 1LL);
@@ -86,7 +87,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
8687
posX += sve_size;
8788
posX_vec = svdup_s64(posX);
8889
j += sve_size;
89-
pg = svwhilelt_b64(j, n);
90+
pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
9091
active = svcntp_b64(svptrue_b64(), pg);
9192
} while (svptest_any(svptrue_b64(), pg));
9293

@@ -99,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
99100

100101
int32_t N = n;
101102
int32_t j = 0;
102-
svbool_t pg = svwhilelt_b32(j, N);
103+
svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
103104
int32_t active = svcntp_b32(svptrue_b32(), pg);
104105
svint32_t index_neg = svindex_s32(0, -1);
105106
svint32_t index = svindex_s32(0, 1);
@@ -133,7 +134,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
133134
posX += sve_size;
134135
posX_vec = svdup_s32(posX);
135136
j += sve_size;
136-
pg = svwhilelt_b32(j, N);
137+
pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
137138
active = svcntp_b32(svptrue_b32(), pg);
138139
} while (svptest_any(svptrue_b32(), pg));
139140

kernel/arm64/trsm_ltcopy_sve.c

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
/*********************************************************************/
22
/* Copyright 2009, 2010 The University of Texas at Austin. */
3+
/* Copyright 2023 The OpenBLAS Project */
34
/* All rights reserved. */
45
/* */
56
/* Redistribution and use in source and binary forms, with or */
@@ -55,12 +56,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
5556
jj = offset;
5657
#ifdef DOUBLE
5758
int64_t js = 0;
58-
svbool_t pn = svwhilelt_b64(js, n);
59+
svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
5960
int n_active = svcntp_b64(svptrue_b64(), pn);
6061
#else
6162
int32_t N = n;
6263
int32_t js = 0;
63-
svbool_t pn = svwhilelt_b32(js, N);
64+
svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
6465
int n_active = svcntp_b32(svptrue_b32(), pn);
6566
#endif
6667
do {
@@ -104,11 +105,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
104105

105106
js += n_active;
106107
#ifdef DOUBLE
107-
pn = svwhilelt_b64(js, n);
108+
pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
108109
n_active = svcntp_b64(svptrue_b64(), pn);
109110
} while (svptest_any(svptrue_b64(), pn));
110111
#else
111-
pn = svwhilelt_b32(js, N);
112+
pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
112113
n_active = svcntp_b32(svptrue_b32(), pn);
113114
} while (svptest_any(svptrue_b32(), pn));
114115
#endif

kernel/arm64/trsm_uncopy_sve.c

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
/*********************************************************************/
22
/* Copyright 2009, 2010 The University of Texas at Austin. */
3+
/* Copyright 2023 The OpenBLAS Project */
34
/* All rights reserved. */
45
/* */
56
/* Redistribution and use in source and binary forms, with or */
@@ -56,13 +57,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
5657
#ifdef DOUBLE
5758
int64_t js = 0;
5859
svint64_t index = svindex_s64(0LL, lda);
59-
svbool_t pn = svwhilelt_b64(js, n);
60+
svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
6061
int n_active = svcntp_b64(svptrue_b64(), pn);
6162
#else
6263
int32_t N = n;
6364
int32_t js = 0;
6465
svint32_t index = svindex_s32(0, lda);
65-
svbool_t pn = svwhilelt_b32(js, N);
66+
svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
6667
int n_active = svcntp_b32(svptrue_b32(), pn);
6768
#endif
6869
do {
@@ -106,11 +107,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
106107

107108
js += n_active;
108109
#ifdef DOUBLE
109-
pn = svwhilelt_b64(js, n);
110+
pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
110111
n_active = svcntp_b64(svptrue_b64(), pn);
111112
} while (svptest_any(svptrue_b64(), pn));
112113
#else
113-
pn = svwhilelt_b32(js, N);
114+
pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
114115
n_active = svcntp_b32(svptrue_b32(), pn);
115116
} while (svptest_any(svptrue_b32(), pn));
116117
#endif

kernel/arm64/trsm_utcopy_sve.c

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
/*********************************************************************/
22
/* Copyright 2009, 2010 The University of Texas at Austin. */
3+
/* Copyright 2023 The OpenBLAS Project */
34
/* All rights reserved. */
45
/* */
56
/* Redistribution and use in source and binary forms, with or */
@@ -55,12 +56,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
5556
jj = offset;
5657
#ifdef DOUBLE
5758
int64_t js = 0;
58-
svbool_t pn = svwhilelt_b64(js, n);
59+
svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
5960
int n_active = svcntp_b64(svptrue_b64(), pn);
6061
#else
6162
int32_t N = n;
6263
int32_t js = 0;
63-
svbool_t pn = svwhilelt_b32(js, N);
64+
svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
6465
int n_active = svcntp_b32(svptrue_b32(), pn);
6566
#endif
6667
do {
@@ -104,11 +105,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
104105

105106
js += n_active;
106107
#ifdef DOUBLE
107-
pn = svwhilelt_b64(js, n);
108+
pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
108109
n_active = svcntp_b64(svptrue_b64(), pn);
109110
} while (svptest_any(svptrue_b64(), pn));
110111
#else
111-
pn = svwhilelt_b32(js, N);
112+
pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
112113
n_active = svcntp_b32(svptrue_b32(), pn);
113114
} while (svptest_any(svptrue_b32(), pn));
114115
#endif

0 commit comments

Comments
 (0)