Skip to content

Commit e4f83d4

Browse files
authored
Merge pull request #5041 from martin-frbg/issue2715
Identify all cores in ARM64 autodetection, return fastest TARGET and performance group sizes
2 parents a63282a + 7fd73a4 commit e4f83d4

File tree

2 files changed

+208
-59
lines changed

2 files changed

+208
-59
lines changed

cpuid_arm64.c

Lines changed: 161 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2626
*****************************************************************************/
2727

28+
#include <stdlib.h>
2829
#include <string.h>
2930
#ifdef __APPLE__
3031
#include <sys/sysctl.h>
@@ -33,6 +34,20 @@ size_t length=sizeof(value);
3334
int64_t value64;
3435
size_t length64=sizeof(value64);
3536
#endif
37+
#if (defined OS_LINUX || defined OS_ANDROID)
38+
#include <asm/hwcap.h>
39+
#include <sys/auxv.h>
40+
#ifndef HWCAP_CPUID
41+
#define HWCAP_CPUID (1 << 11)
42+
#endif
43+
#ifndef HWCAP_SVE
44+
#define HWCAP_SVE (1 << 22)
45+
#endif
46+
47+
#define get_cpu_ftr(id, var) ({ \
48+
__asm__ __volatile__ ("mrs %0, "#id : "=r" (var)); \
49+
})
50+
#endif
3651

3752
#define CPU_UNKNOWN 0
3853
#define CPU_ARMV8 1
@@ -42,11 +57,11 @@ size_t length64=sizeof(value64);
4257
#define CPU_CORTEXA57 3
4358
#define CPU_CORTEXA72 4
4459
#define CPU_CORTEXA73 5
45-
#define CPU_CORTEXA76 23
60+
#define CPU_CORTEXA76 23
4661
#define CPU_NEOVERSEN1 11
4762
#define CPU_NEOVERSEV1 16
4863
#define CPU_NEOVERSEN2 17
49-
#define CPU_NEOVERSEV2 24
64+
#define CPU_NEOVERSEV2 24
5065
#define CPU_CORTEXX1 18
5166
#define CPU_CORTEXX2 19
5267
#define CPU_CORTEXA510 20
@@ -93,7 +108,7 @@ static char *cpuname[] = {
93108
"CORTEXA710",
94109
"FT2000",
95110
"CORTEXA76",
96-
"NEOVERSEV2"
111+
"NEOVERSEV2"
97112
};
98113

99114
static char *cpuname_lower[] = {
@@ -121,9 +136,13 @@ static char *cpuname_lower[] = {
121136
"cortexa710",
122137
"ft2000",
123138
"cortexa76",
124-
"neoversev2"
139+
"neoversev2"
125140
};
126141

142+
/* Number of cores counted in each performance class (low / mid / high).
 * detect() resets and increments these; the non-zero ones are printed as
 * NUM_CORES_LP, NUM_CORES_MP and NUM_CORES_HP. */
static int cpulowperf=0;
143+
static int cpumidperf=0;
144+
static int cpuhiperf=0;
145+
127146
int get_feature(char *search)
128147
{
129148

@@ -158,33 +177,108 @@ int get_feature(char *search)
158177
#endif
159178
return(0);
160179
}
161-
180+
/*
 * qsort() comparator that orders int values from largest to smallest.
 *
 * model1, model2: pointers to the two int elements being compared.
 * Returns a positive value when *model2 > *model1, a negative value when
 * *model2 < *model1, and 0 when they are equal, so the largest element
 * ends up at index 0 after sorting.
 *
 * The result is built from comparisons instead of a subtraction so that
 * operands far apart (e.g. INT_MIN vs. INT_MAX) cannot trigger signed
 * overflow and yield a result with the wrong sign.
 */
static int cpusort(const void *model1, const void *model2)
{
  const int first = *(const int *)model1;
  const int second = *(const int *)model2;

  return (second > first) - (second < first);
}
162184

163185
int detect(void)
164186
{
165187

166188
#if defined( __linux ) || defined( __NetBSD__ )
167-
189+
int n,i,ii;
190+
int midr_el1;
191+
int implementer;
192+
int cpucap[1024];
193+
int cpucores[1024];
168194
FILE *infile;
169-
char buffer[512], *p, *cpu_part = NULL, *cpu_implementer = NULL;
195+
char cpupart[6],cpuimpl[6];
196+
char *cpu_impl=NULL,*cpu_pt=NULL;
197+
char buffer[2048], *p, *cpu_part = NULL, *cpu_implementer = NULL;
170198
p = (char *) NULL ;
171-
172-
infile = fopen("/proc/cpuinfo", "r");
173-
while (fgets(buffer, sizeof(buffer), infile)) {
174-
if ((cpu_part != NULL) && (cpu_implementer != NULL)) {
175-
break;
199+
cpulowperf=cpumidperf=cpuhiperf=0;
200+
for (i=0;i<1024;i++)cpucores[i]=0;
201+
n=0;
202+
infile = fopen("/sys/devices/system/cpu/possible", "r");
203+
if (!infile) {
204+
infile = fopen("/proc/cpuinfo", "r");
205+
while (fgets(buffer, sizeof(buffer), infile)) {
206+
if (!strncmp("processor", buffer, 9))
207+
n++;
176208
}
177-
178-
if ((cpu_part == NULL) && !strncmp("CPU part", buffer, 8)) {
179-
cpu_part = strchr(buffer, ':') + 2;
180-
cpu_part = strdup(cpu_part);
181-
} else if ((cpu_implementer == NULL) && !strncmp("CPU implementer", buffer, 15)) {
182-
cpu_implementer = strchr(buffer, ':') + 2;
183-
cpu_implementer = strdup(cpu_implementer);
209+
} else {
210+
fgets(buffer, sizeof(buffer), infile);
211+
sscanf(buffer,"0-%d",&n);
212+
n++;
213+
}
214+
fclose(infile);
215+
216+
cpu_implementer=NULL;
217+
for (i=0;i<n;i++){
218+
sprintf(buffer,"/sys/devices/system/cpu/cpu%d/regs/identification/midr_el1",i);
219+
infile= fopen(buffer,"r");
220+
if (!infile) {
221+
infile = fopen("/proc/cpuinfo", "r");
222+
for (ii=0;ii<n;ii++){
223+
cpu_part=NULL;cpu_implementer=NULL;
224+
while (fgets(buffer, sizeof(buffer), infile)) {
225+
if ((cpu_part != NULL) && (cpu_implementer != NULL)) {
226+
break;
227+
}
228+
229+
if ((cpu_part == NULL) && !strncmp("CPU part", buffer, 8)) {
230+
cpu_pt = strchr(buffer, ':') + 2;
231+
cpu_part = strdup(cpu_pt);
232+
cpucores[i]=strtol(cpu_part,NULL,0);
233+
234+
} else if ((cpu_implementer == NULL) && !strncmp("CPU implementer", buffer, 15)) {
235+
cpu_impl = strchr(buffer, ':') + 2;
236+
cpu_implementer = strdup(cpu_impl);
237+
}
238+
239+
}
240+
if (strstr(cpu_implementer, "0x41")) {
241+
if (cpucores[ii] >= 0xd4b) cpuhiperf++;
242+
else
243+
if (cpucores[ii] >= 0xd07) cpumidperf++;
244+
else cpulowperf++;
245+
}
246+
else cpulowperf++;
247+
}
248+
fclose(infile);
249+
break;
250+
} else {
251+
(void)fgets(buffer, sizeof(buffer), infile);
252+
midr_el1=strtoul(buffer,NULL,16);
253+
fclose(infile);
254+
implementer = (midr_el1 >> 24) & 0xFF;
255+
cpucores[i] = (midr_el1 >> 4) & 0xFFF;
256+
sprintf(buffer,"/sys/devices/system/cpu/cpu%d/cpu_capacity",i);
257+
infile= fopen(buffer,"r");
258+
if (!infile) {
259+
if (implementer== 65) {
260+
if (cpucores[i] >= 0xd4b) cpuhiperf++;
261+
else
262+
if (cpucores[i] >= 0xd07) cpumidperf++;
263+
else cpulowperf++;
264+
}
265+
else cpulowperf++;
266+
} else {
267+
(void)fgets(buffer, sizeof(buffer), infile);
268+
sscanf(buffer,"%d",&cpucap[i]);
269+
if (cpucap[i] >= 1000) cpuhiperf++;
270+
else
271+
if (cpucap[i] >= 500) cpumidperf++;
272+
else cpulowperf++;
273+
fclose(infile);
274+
}
184275
}
276+
sprintf(cpuimpl,"0x%2x",implementer);
277+
cpu_implementer=strdup(cpuimpl);
185278
}
186-
187-
fclose(infile);
279+
qsort(cpucores,1024,sizeof(int),cpusort);
280+
sprintf(cpupart,"0x%3x",cpucores[0]);
281+
cpu_part=strdup(cpupart);
188282
if(cpu_part != NULL && cpu_implementer != NULL) {
189283
// Arm
190284
if (strstr(cpu_implementer, "0x41")) {
@@ -219,7 +313,7 @@ int detect(void)
219313
else if (strstr(cpu_part, "0xd4f")) //NVIDIA Grace et al.
220314
return CPU_NEOVERSEV2;
221315
else if (strstr(cpu_part, "0xd0b"))
222-
return CPU_CORTEXA76;
316+
return CPU_CORTEXA76;
223317
}
224318
// Qualcomm
225319
else if (strstr(cpu_implementer, "0x51") && strstr(cpu_part, "0xc00"))
@@ -277,11 +371,20 @@ int detect(void)
277371
}
278372
#else
279373
#ifdef __APPLE__
374+
sysctlbyname("hw.ncpu",&value64,&length64,NULL,0);
375+
cpulowperf=value64;
376+
sysctlbyname("hw.nperflevels",&value64,&length64,NULL,0);
377+
if (value64 > 1) {
378+
sysctlbyname("hw.perflevel0.cpusperl",&value64,&length64,NULL,0);
379+
cpuhiperf=value64;
380+
sysctlbyname("hw.perflevel1.cpusperl",&value64,&length64,NULL,0);
381+
cpulowperf=value64;
382+
}
280383
sysctlbyname("hw.cpufamily",&value64,&length64,NULL,0);
281384
if (value64 ==131287967|| value64 == 458787763 ) return CPU_VORTEX; //A12/M1
282385
if (value64 == 3660830781) return CPU_VORTEX; //A15/M2
283-
if (value64 == 2271604202) return CPU_VORTEX; //A16/M3
284-
if (value64 == 1867590060) return CPU_VORTEX; //M4
386+
if (value64 == 2271604202) return CPU_VORTEX; //A16/M3
387+
if (value64 == 1867590060) return CPU_VORTEX; //M4
285388
#endif
286389
return CPU_ARMV8;
287390
#endif
@@ -331,10 +434,22 @@ int n=0;
331434
fclose(infile);
332435

333436
printf("#define NUM_CORES %d\n",n);
437+
if (cpulowperf >0)
438+
printf("#define NUM_CORES_LP %d\n",cpulowperf);
439+
if (cpumidperf >0)
440+
printf("#define NUM_CORES_MP %d\n",cpumidperf);
441+
if (cpuhiperf >0)
442+
printf("#define NUM_CORES_HP %d\n",cpuhiperf);
334443
#endif
335444
#ifdef __APPLE__
336445
sysctlbyname("hw.physicalcpu_max",&value,&length,NULL,0);
337446
printf("#define NUM_CORES %d\n",value);
447+
if (cpulowperf >0)
448+
printf("#define NUM_CORES_LP %d\n",cpulowperf);
449+
if (cpumidperf >0)
450+
printf("#define NUM_CORES_MP %d\n",cpumidperf);
451+
if (cpuhiperf >0)
452+
printf("#define NUM_CORES_HP %d\n",cpuhiperf);
338453
#endif
339454
}
340455

@@ -347,7 +462,6 @@ void get_cpuconfig(void)
347462
printf("#define ARMV8\n");
348463
printf("#define HAVE_NEON\n"); // This shouldn't be necessary
349464
printf("#define HAVE_VFPV4\n"); // This shouldn't be necessary
350-
351465
int d = detect();
352466
switch (d)
353467
{
@@ -402,8 +516,8 @@ void get_cpuconfig(void)
402516
break;
403517

404518
case CPU_NEOVERSEV1:
405-
printf("#define HAVE_SVE 1\n");
406-
case CPU_CORTEXA76:
519+
printf("#define HAVE_SVE 1\n");
520+
case CPU_CORTEXA76:
407521
printf("#define %s\n", cpuname[d]);
408522
printf("#define L1_CODE_SIZE 65536\n");
409523
printf("#define L1_CODE_LINESIZE 64\n");
@@ -431,32 +545,32 @@ void get_cpuconfig(void)
431545
printf("#define L2_ASSOCIATIVE 8\n");
432546
printf("#define DTB_DEFAULT_ENTRIES 48\n");
433547
printf("#define DTB_SIZE 4096\n");
434-
printf("#define HAVE_SVE 1\n");
548+
printf("#define HAVE_SVE 1\n");
435549
break;
436-
case CPU_NEOVERSEV2:
550+
case CPU_NEOVERSEV2:
437551
printf("#define ARMV9\n");
438-
printf("#define HAVE_SVE 1\n");
439-
printf("#define %s\n", cpuname[d]);
440-
printf("#define L1_CODE_SIZE 65536\n");
441-
printf("#define L1_CODE_LINESIZE 64\n");
442-
printf("#define L1_CODE_ASSOCIATIVE 4\n");
443-
printf("#define L1_DATA_SIZE 65536\n");
444-
printf("#define L1_DATA_LINESIZE 64\n");
445-
printf("#define L1_DATA_ASSOCIATIVE 4\n");
446-
printf("#define L2_SIZE 1048576\n");
447-
printf("#define L2_LINESIZE 64\n");
448-
printf("#define L2_ASSOCIATIVE 8\n");
449-
// L1 Data TLB = 48 entries
450-
// L2 Data TLB = 2048 entries
451-
printf("#define DTB_DEFAULT_ENTRIES 48\n");
452-
printf("#define DTB_SIZE 4096\n"); // Set to 4096 for symmetry with other configs.
453-
break;
552+
printf("#define HAVE_SVE 1\n");
553+
printf("#define %s\n", cpuname[d]);
554+
printf("#define L1_CODE_SIZE 65536\n");
555+
printf("#define L1_CODE_LINESIZE 64\n");
556+
printf("#define L1_CODE_ASSOCIATIVE 4\n");
557+
printf("#define L1_DATA_SIZE 65536\n");
558+
printf("#define L1_DATA_LINESIZE 64\n");
559+
printf("#define L1_DATA_ASSOCIATIVE 4\n");
560+
printf("#define L2_SIZE 1048576\n");
561+
printf("#define L2_LINESIZE 64\n");
562+
printf("#define L2_ASSOCIATIVE 8\n");
563+
// L1 Data TLB = 48 entries
564+
// L2 Data TLB = 2048 entries
565+
printf("#define DTB_DEFAULT_ENTRIES 48\n");
566+
printf("#define DTB_SIZE 4096\n"); // Set to 4096 for symmetry with other configs.
567+
break;
454568
case CPU_CORTEXA510:
455569
case CPU_CORTEXA710:
456570
case CPU_CORTEXX1:
457571
case CPU_CORTEXX2:
458572
printf("#define ARMV9\n");
459-
printf("#define HAVE_SVE 1\n");
573+
printf("#define HAVE_SVE 1\n");
460574
printf("#define %s\n", cpuname[d]);
461575
printf("#define L1_CODE_SIZE 65536\n");
462576
printf("#define L1_CODE_LINESIZE 64\n");
@@ -559,8 +673,6 @@ void get_cpuconfig(void)
559673
case CPU_VORTEX:
560674
printf("#define VORTEX \n");
561675
#ifdef __APPLE__
562-
sysctlbyname("hw.cpufamily",&value64,&length64,NULL,0);
563-
if (value64 == 1867590060) printf("#define HAVE_SME 1\n");; //M4
564676
sysctlbyname("hw.l1icachesize",&value64,&length64,NULL,0);
565677
printf("#define L1_CODE_SIZE %lld \n",value64);
566678
sysctlbyname("hw.cachelinesize",&value64,&length64,NULL,0);
@@ -575,7 +687,7 @@ void get_cpuconfig(void)
575687
break;
576688
case CPU_A64FX:
577689
printf("#define A64FX\n");
578-
printf("#define HAVE_SVE 1\n");
690+
printf("#define HAVE_SVE 1\n");
579691
printf("#define L1_CODE_SIZE 65535\n");
580692
printf("#define L1_DATA_SIZE 65535\n");
581693
printf("#define L1_DATA_LINESIZE 256\n");

0 commit comments

Comments
 (0)