[AArch64] Add support for -mcpu=gb10. (#146515)

rj-jesus · web-flow · commit 84e54515bc4e · 2025-07-07T11:14:26.000+01:00
This patch adds support for -mcpu=gb10 (NVIDIA GB10). This is a big.LITTLE cluster of Cortex-X925 and Cortex-A725 cores. The appropriate MIDR numbers are added to detect them in -mcpu=native. We did not add an -mcpu=cortex-x925.cortex-a725 option because GB10 does include the crypto instructions which we want on by default, and the current convention is to not enable such extensions for Arm Cortex cores in -mcpu where they are optional in the IP. Relevant GCC patch: https://gcc.gnu.org/pipermail/gcc-patches/2025-June/687005.html
diff --git a/clang/test/Driver/print-enabled-extensions/aarch64-gb10.c b/clang/test/Driver/print-enabled-extensions/aarch64-gb10.c
@@ -0,0 +1,69 @@
+// REQUIRES: aarch64-registered-target
+// RUN: %clang --target=aarch64 --print-enabled-extensions -mcpu=gb10 | FileCheck --strict-whitespace --implicit-check-not=FEAT_ %s
+
+// CHECK: Extensions enabled for the given AArch64 target
+// CHECK-EMPTY:
+// CHECK-NEXT:     Architecture Feature(s)                                Description
+// CHECK-NEXT:     FEAT_AES, FEAT_PMULL                                   Enable AES support
+// CHECK-NEXT:     FEAT_AMUv1                                             Enable Armv8.4-A Activity Monitors extension
+// CHECK-NEXT:     FEAT_AMUv1p1                                           Enable Armv8.6-A Activity Monitors Virtualization support
+// CHECK-NEXT:     FEAT_AdvSIMD                                           Enable Advanced SIMD instructions
+// CHECK-NEXT:     FEAT_BF16                                              Enable BFloat16 Extension
+// CHECK-NEXT:     FEAT_BTI                                               Enable Branch Target Identification
+// CHECK-NEXT:     FEAT_CCIDX                                             Enable Armv8.3-A Extend of the CCSIDR number of sets
+// CHECK-NEXT:     FEAT_CRC32                                             Enable Armv8.0-A CRC-32 checksum instructions
+// CHECK-NEXT:     FEAT_CSV2_2                                            Enable architectural speculation restriction
+// CHECK-NEXT:     FEAT_DIT                                               Enable Armv8.4-A Data Independent Timing instructions
+// CHECK-NEXT:     FEAT_DPB                                               Enable Armv8.2-A data Cache Clean to Point of Persistence
+// CHECK-NEXT:     FEAT_DPB2                                              Enable Armv8.5-A Cache Clean to Point of Deep Persistence
+// CHECK-NEXT:     FEAT_DotProd                                           Enable dot product support
+// CHECK-NEXT:     FEAT_ECV                                               Enable enhanced counter virtualization extension
+// CHECK-NEXT:     FEAT_ETE                                               Enable Embedded Trace Extension
+// CHECK-NEXT:     FEAT_FCMA                                              Enable Armv8.3-A Floating-point complex number support
+// CHECK-NEXT:     FEAT_FGT                                               Enable fine grained virtualization traps extension
+// CHECK-NEXT:     FEAT_FHM                                               Enable FP16 FML instructions
+// CHECK-NEXT:     FEAT_FP                                                Enable Armv8.0-A Floating Point Extensions
+// CHECK-NEXT:     FEAT_FP16                                              Enable half-precision floating-point data processing
+// CHECK-NEXT:     FEAT_FPAC                                              Enable Armv8.3-A Pointer Authentication Faulting enhancement
+// CHECK-NEXT:     FEAT_FRINTTS                                           Enable FRInt[32|64][Z|X] instructions that round a floating-point number to an integer (in FP format) forcing it to fit into a 32- or 64-bit int
+// CHECK-NEXT:     FEAT_FlagM                                             Enable Armv8.4-A Flag Manipulation instructions
+// CHECK-NEXT:     FEAT_FlagM2                                            Enable alternative NZCV format for floating point comparisons
+// CHECK-NEXT:     FEAT_HCX                                               Enable Armv8.7-A HCRX_EL2 system register
+// CHECK-NEXT:     FEAT_I8MM                                              Enable Matrix Multiply Int8 Extension
+// CHECK-NEXT:     FEAT_JSCVT                                             Enable Armv8.3-A JavaScript FP conversion instructions
+// CHECK-NEXT:     FEAT_LOR                                               Enable Armv8.1-A Limited Ordering Regions extension
+// CHECK-NEXT:     FEAT_LRCPC                                             Enable support for RCPC extension
+// CHECK-NEXT:     FEAT_LRCPC2                                            Enable Armv8.4-A RCPC instructions with Immediate Offsets
+// CHECK-NEXT:     FEAT_LSE                                               Enable Armv8.1-A Large System Extension (LSE) atomic instructions
+// CHECK-NEXT:     FEAT_LSE2                                              Enable Armv8.4-A Large System Extension 2 (LSE2) atomicity rules
+// CHECK-NEXT:     FEAT_MPAM                                              Enable Armv8.4-A Memory system Partitioning and Monitoring extension
+// CHECK-NEXT:     FEAT_MTE, FEAT_MTE2                                    Enable Memory Tagging Extension
+// CHECK-NEXT:     FEAT_NV, FEAT_NV2                                      Enable Armv8.4-A Nested Virtualization Enchancement
+// CHECK-NEXT:     FEAT_PAN                                               Enable Armv8.1-A Privileged Access-Never extension
+// CHECK-NEXT:     FEAT_PAN2                                              Enable Armv8.2-A PAN s1e1R and s1e1W Variants
+// CHECK-NEXT:     FEAT_PAuth                                             Enable Armv8.3-A Pointer Authentication extension
+// CHECK-NEXT:     FEAT_PMUv3                                             Enable Armv8.0-A PMUv3 Performance Monitors extension
+// CHECK-NEXT:     FEAT_RAS, FEAT_RASv1p1                                 Enable Armv8.0-A Reliability, Availability and Serviceability Extensions
+// CHECK-NEXT:     FEAT_RDM                                               Enable Armv8.1-A Rounding Double Multiply Add/Subtract instructions
+// CHECK-NEXT:     FEAT_SB                                                Enable Armv8.5-A Speculation Barrier
+// CHECK-NEXT:     FEAT_SEL2                                              Enable Armv8.4-A Secure Exception Level 2 extension
+// CHECK-NEXT:     FEAT_SHA1, FEAT_SHA256                                 Enable SHA1 and SHA256 support
+// CHECK-NEXT:     FEAT_SHA3, FEAT_SHA512                                 Enable SHA512 and SHA3 support
+// CHECK-NEXT:     FEAT_SM4, FEAT_SM3                                     Enable SM3 and SM4 support
+// CHECK-NEXT:     FEAT_SPE                                               Enable Statistical Profiling extension
+// CHECK-NEXT:     FEAT_SPECRES                                           Enable Armv8.5-A execution and data prediction invalidation instructions
+// CHECK-NEXT:     FEAT_SPEv1p2                                           Enable extra register in the Statistical Profiling Extension
+// CHECK-NEXT:     FEAT_SSBS, FEAT_SSBS2                                  Enable Speculative Store Bypass Safe bit
+// CHECK-NEXT:     FEAT_SVE                                               Enable Scalable Vector Extension (SVE) instructions
+// CHECK-NEXT:     FEAT_SVE2                                              Enable Scalable Vector Extension 2 (SVE2) instructions
+// CHECK-NEXT:     FEAT_SVE_AES, FEAT_SVE_PMULL128                        Enable SVE AES and quadword SVE polynomial multiply instructions
+// CHECK-NEXT:     FEAT_SVE_BitPerm                                       Enable bit permutation SVE2 instructions
+// CHECK-NEXT:     FEAT_SVE_SHA3                                          Enable SVE SHA3 instructions
+// CHECK-NEXT:     FEAT_SVE_SM4                                           Enable SM4 SVE2 instructions
+// CHECK-NEXT:     FEAT_TLBIOS, FEAT_TLBIRANGE                            Enable Armv8.4-A TLB Range and Maintenance instructions
+// CHECK-NEXT:     FEAT_TRBE                                              Enable Trace Buffer Extension
+// CHECK-NEXT:     FEAT_TRF                                               Enable Armv8.4-A Trace extension
+// CHECK-NEXT:     FEAT_UAO                                               Enable Armv8.2-A UAO PState
+// CHECK-NEXT:     FEAT_VHE                                               Enable Armv8.1-A Virtual Host extension
+// CHECK-NEXT:     FEAT_WFxT                                              Enable Armv8.7-A WFET and WFIT instruction
+// CHECK-NEXT:     FEAT_XS                                                Enable Armv8.7-A limited-TLB-maintenance instruction
diff --git a/clang/test/Misc/target-invalid-cpu-note/aarch64.c b/clang/test/Misc/target-invalid-cpu-note/aarch64.c
@@ -75,6 +75,7 @@
 // CHECK-SAME: {{^}}, exynos-m5
 // CHECK-SAME: {{^}}, falkor
 // CHECK-SAME: {{^}}, fujitsu-monaka
+// CHECK-SAME: {{^}}, gb10
 // CHECK-SAME: {{^}}, generic
 // CHECK-SAME: {{^}}, grace
 // CHECK-SAME: {{^}}, kryo
diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td
@@ -1117,6 +1117,7 @@ def ProcessorFeatures {
                                      FeatureDotProd, FeatureFPARMv8, FeatureMatMulInt8,
                                      FeatureSSBS, FeatureCCIDX,
                                      FeatureJS, FeatureLSE, FeatureRAS, FeatureRCPC, FeatureRDM];
+  list<SubtargetFeature> GB10 = !listconcat(X925, [FeatureSVEAES, FeatureSVESHA3, FeatureSVE2SM4]);
   list<SubtargetFeature> Grace = !listconcat(NeoverseV2, [FeatureSVE2SM4, FeatureSVEAES, FeatureSVESHA3]);
 
   // ETE and TRBE are future architecture extensions. We temporarily enable them
@@ -1203,6 +1204,8 @@ def : ProcessorModel<"cortex-x4", NeoverseV2Model, ProcessorFeatures.X4,
                      [TuneX4]>;
 def : ProcessorModel<"cortex-x925", NeoverseV2Model, ProcessorFeatures.X925,
                      [TuneX925]>;
+def : ProcessorModel<"gb10", NeoverseV2Model, ProcessorFeatures.GB10,
+                     [TuneX925]>;
 def : ProcessorModel<"grace", NeoverseV2Model, ProcessorFeatures.Grace,
                      [TuneNeoverseV2]>;
 def : ProcessorModel<"neoverse-e1", CortexA53Model,
diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp
@@ -176,25 +176,43 @@ StringRef sys::detail::getHostCPUNameForARM(StringRef ProcCpuinfoContent) {
   SmallVector<StringRef, 32> Lines;
   ProcCpuinfoContent.split(Lines, '\n');
 
-  // Look for the CPU implementer line.
+  // Look for the CPU implementer and hardware lines, and store the CPU part
+  // numbers found.
   StringRef Implementer;
   StringRef Hardware;
-  StringRef Part;
+  SmallVector<StringRef, 32> Parts;
   for (StringRef Line : Lines) {
     if (Line.consume_front("CPU implementer"))
       Implementer = Line.ltrim("\t :");
     else if (Line.consume_front("Hardware"))
       Hardware = Line.ltrim("\t :");
     else if (Line.consume_front("CPU part"))
-      Part = Line.ltrim("\t :");
+      Parts.emplace_back(Line.ltrim("\t :"));
   }
 
+  // Last `Part' seen, in case we don't analyse all `Parts' parsed.
+  StringRef Part = Parts.empty() ? StringRef() : Parts.back();
+
+  // Remove duplicate `Parts'.
+  llvm::sort(Parts);
+  Parts.erase(llvm::unique(Parts), Parts.end());
+
+  auto MatchBigLittle = [](auto const &Parts, StringRef Big, StringRef Little) {
+    if (Parts.size() == 2)
+      return (Parts[0] == Big && Parts[1] == Little) ||
+             (Parts[1] == Big && Parts[0] == Little);
+    return false;
+  };
+
   if (Implementer == "0x41") { // ARM Ltd.
     // MSM8992/8994 may give cpu part for the core that the kernel is running on,
     // which is undeterministic and wrong. Always return cortex-a53 for these SoC.
     if (Hardware.ends_with("MSM8994") || Hardware.ends_with("MSM8996"))
       return "cortex-a53";
 
+    // Detect big.LITTLE systems.
+    if (MatchBigLittle(Parts, "0xd85", "0xd87"))
+      return "cortex-x925";
 
     // The CPU part is a 3 digit hexadecimal number with a 0x prefix. The
     // values correspond to the "Part number" in the CP15/c0 register. The
diff --git a/llvm/unittests/TargetParser/Host.cpp b/llvm/unittests/TargetParser/Host.cpp
@@ -122,6 +122,14 @@ TEST(getLinuxHostCPUName, AArch64) {
   EXPECT_EQ(sys::detail::getHostCPUNameForARM("CPU implementer : 0x41\n"
                                               "CPU part        : 0xd48"),
             "cortex-x2");
+  EXPECT_EQ(sys::detail::getHostCPUNameForARM("CPU implementer : 0x41\n"
+                                              "CPU part        : 0xd85\n"
+                                              "CPU part        : 0xd87"),
+            "cortex-x925");
+  EXPECT_EQ(sys::detail::getHostCPUNameForARM("CPU implementer : 0x41\n"
+                                              "CPU part        : 0xd87\n"
+                                              "CPU part        : 0xd85"),
+            "cortex-x925");
   EXPECT_EQ(sys::detail::getHostCPUNameForARM("CPU implementer : 0x51\n"
                                               "CPU part        : 0xc00"),
             "falkor");
diff --git a/llvm/unittests/TargetParser/TargetParserTest.cpp b/llvm/unittests/TargetParser/TargetParserTest.cpp
@@ -1164,6 +1164,7 @@ INSTANTIATE_TEST_SUITE_P(
                       AArch64CPUTestParams("a64fx", "armv8.2-a"),
                       AArch64CPUTestParams("fujitsu-monaka", "armv9.3-a"),
                       AArch64CPUTestParams("carmel", "armv8.2-a"),
+                      AArch64CPUTestParams("gb10", "armv9.2-a"),
                       AArch64CPUTestParams("grace", "armv9-a"),
                       AArch64CPUTestParams("olympus", "armv9.2-a"),
                       AArch64CPUTestParams("saphira", "armv8.4-a"),
@@ -1260,7 +1261,7 @@ INSTANTIATE_TEST_SUITE_P(
     AArch64CPUAliasTestParams::PrintToStringParamName);
 
 // Note: number of CPUs includes aliases.
-static constexpr unsigned NumAArch64CPUArchs = 90;
+static constexpr unsigned NumAArch64CPUArchs = 91;
 
 TEST(TargetParserTest, testAArch64CPUArchList) {
   SmallVector<StringRef, NumAArch64CPUArchs> List;