[LoopStrengthReduce] Mitigation of issues introduced by compilation time optimization in SolveRecurse. #147588

Open · wants to merge 6 commits into main
155 changes: 111 additions & 44 deletions llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -2229,6 +2229,7 @@ class LSRInstance {
void NarrowSearchSpaceByDeletingCostlyFormulas();
void NarrowSearchSpaceByPickingWinnerRegs();
void NarrowSearchSpaceUsingHeuristics();
bool SortLSRUses();

void SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
Cost &SolutionCost,
@@ -5368,6 +5369,45 @@ void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
NarrowSearchSpaceByPickingWinnerRegs();
}

/// Sort LSRUses to address side effects of the compile-time optimization done
/// in SolveRecurse, which filters out formulae that don't include required
/// registers. That optimization makes the best solution found sensitive to
/// the order in which LSRUses are processed, so it's important to ensure that
/// order isn't random, to avoid fluctuations and sub-optimal results.
///
/// Also check that all LSRUses have formulae, as otherwise the situation is
/// unsolvable.
bool LSRInstance::SortLSRUses() {
SmallVector<LSRUse *, 16> NewOrder;
Contributor: Why do you need this temporary vector? Can't you sort Uses in place?

@SergeyShch01 (Author), Jul 9, 2025: LSRUse is quite heavy (sizeof(LSRUse) == 2184), and sorting may move elements around many times while it runs. It is therefore faster to sort an array of pointers and then establish the new order in the original array afterwards.
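
A minimal standalone sketch of the pattern described above — sort pointers, then rebuild the container — with a hypothetical Big type standing in for LSRUse; an illustration of the idea, not the PR's code:

```cpp
// Sketch only: sort pointers to heavy elements, then rebuild the container.
// "Big" is a hypothetical stand-in for LSRUse (~2184 bytes each).
#include <algorithm>
#include <iostream>
#include <utility>
#include <vector>

struct Big {
  int Key;            // sort key
  char Payload[2184]; // heavy payload; never moved while sorting
};

int main() {
  std::vector<Big> Uses(3);
  Uses[0].Key = 2;
  Uses[1].Key = 3;
  Uses[2].Key = 1;

  // Sort an array of pointers instead of shuffling ~2 KiB objects around.
  std::vector<Big *> Order;
  for (Big &B : Uses)
    Order.push_back(&B);
  std::stable_sort(Order.begin(), Order.end(),
                   [](const Big *L, const Big *R) { return L->Key > R->Key; });

  // Establish the new order in the original container: one move per element.
  std::vector<Big> NewUses;
  NewUses.reserve(Uses.size());
  for (Big *B : Order)
    NewUses.push_back(std::move(*B));
  Uses = std::move(NewUses);

  for (const Big &B : Uses)
    std::cout << B.Key << ' '; // prints: 3 2 1
  std::cout << '\n';
}
```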

for (LSRUse &LU : Uses) {
if (LU.Formulae.empty()) {
return false;
}
NewOrder.push_back(&LU);
}

stable_sort(NewOrder, [](const LSRUse *L, const LSRUse *R) {
auto CalcKey = [](const LSRUse *LU) {
// LSRUses with many registers and formulae go first, to avoid an overly
// large reduction in the number of considered solutions.
return std::tuple(LU->Regs.size(), LU->Formulae.size(), LU->Kind,
LU->MinOffset.getKnownMinValue(),
LU->MaxOffset.getKnownMinValue(),
LU->AllFixupsOutsideLoop, LU->RigidFormula);
};
return CalcKey(L) > CalcKey(R);
});

SmallVector<LSRUse, 4> NewUses;
for (LSRUse *LU : NewOrder)
NewUses.push_back(std::move(*LU));
Uses = std::move(NewUses);

LLVM_DEBUG(dbgs() << "\nAfter sorting:\n"; print_uses(dbgs()));

return true;
}
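
A minimal sketch of the comparator idiom used in SortLSRUses above: building a std::tuple key and comparing with operator> yields a descending lexicographic order over all key components at once. The Use type and its fields below are hypothetical stand-ins:

```cpp
// Sketch only: descending lexicographic ordering via std::tuple keys.
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <tuple>
#include <vector>

struct Use {
  std::size_t NumRegs;
  std::size_t NumFormulae;
};

int main() {
  std::vector<Use> Uses = {{1, 5}, {2, 1}, {1, 2}};
  std::stable_sort(Uses.begin(), Uses.end(), [](const Use &L, const Use &R) {
    auto Key = [](const Use &U) {
      return std::tuple(U.NumRegs, U.NumFormulae); // compared field by field
    };
    return Key(L) > Key(R); // '>' puts larger keys (more regs/formulae) first
  });
  for (const Use &U : Uses)
    std::cout << '(' << U.NumRegs << ',' << U.NumFormulae << ") ";
  std::cout << '\n'; // prints: (2,1) (1,5) (1,2)
}
```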

/// This is the recursive solver.
void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
Cost &SolutionCost,
@@ -5387,6 +5427,10 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,

const LSRUse &LU = Uses[Workspace.size()];

assert(!LU.Formulae.empty() &&
"LSRUse w/o formulae leads to an unsolvable situation, so it "
"shouldn't be here");

// If this use references any register that's already a part of the
// in-progress solution, consider it a requirement that a formula must
// reference that register in order to be considered. This prunes out
@@ -5398,54 +5442,73 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,

SmallPtrSet<const SCEV *, 16> NewRegs;
Cost NewCost(L, SE, TTI, AMK);
for (const Formula &F : LU.Formulae) {
// Ignore formulae which may not be ideal in terms of register reuse of
// ReqRegs. The formula should use all required registers before
// introducing new ones.
// This can sometimes (notably when trying to favour postinc) lead to
// sub-optimial decisions. There it is best left to the cost modelling to
// get correct.
if (AMK != TTI::AMK_PostIndexed || LU.Kind != LSRUse::Address) {
int NumReqRegsToFind = std::min(F.getNumRegs(), ReqRegs.size());
for (const SCEV *Reg : ReqRegs) {
if ((F.ScaledReg && F.ScaledReg == Reg) ||
is_contained(F.BaseRegs, Reg)) {
--NumReqRegsToFind;
if (NumReqRegsToFind == 0)
break;
bool FormulaeTested = false;
unsigned NumReqRegsToIgnore = 0;

while (!FormulaeTested) {
assert((!NumReqRegsToIgnore || NumReqRegsToIgnore < ReqRegs.size()) &&
"at least one formula should have at least one required register");

for (const Formula &F : LU.Formulae) {
// Ignore formulae which may not be ideal in terms of register reuse of
// ReqRegs. The formula should use required registers before
// introducing new ones. First try the most aggressive option (where the
// maximum number of required registers is used) and then gradually weaken
// it if no formula satisfies the requirement.
//
// This can sometimes (notably when trying to favour postinc) lead to
// sub-optimal decisions; in such cases it is best left to the cost
// modeling to get right.
if (!ReqRegs.empty() &&
(AMK != TTI::AMK_PostIndexed || LU.Kind != LSRUse::Address)) {
unsigned NumReqRegsToFind = std::min(F.getNumRegs(), ReqRegs.size());
bool ReqRegsFound = false;
for (const SCEV *Reg : ReqRegs) {
if ((F.ScaledReg && F.ScaledReg == Reg) ||
is_contained(F.BaseRegs, Reg)) {
ReqRegsFound = true;
if (--NumReqRegsToFind == NumReqRegsToIgnore)
break;
}
}
if (!ReqRegsFound || NumReqRegsToFind != NumReqRegsToIgnore) {
continue;
}
}
if (NumReqRegsToFind != 0) {
// If none of the formulae satisfied the required registers, then we could
// clear ReqRegs and try again. Currently, we simply give up in this case.
continue;
}
}

// Evaluate the cost of the current formula. If it's already worse than
// the current best, prune the search at that point.
NewCost = CurCost;
NewRegs = CurRegs;
NewCost.RateFormula(F, NewRegs, VisitedRegs, LU);
if (NewCost.isLess(SolutionCost)) {
Workspace.push_back(&F);
if (Workspace.size() != Uses.size()) {
SolveRecurse(Solution, SolutionCost, Workspace, NewCost,
NewRegs, VisitedRegs);
if (F.getNumRegs() == 1 && Workspace.size() == 1)
VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]);
} else {
LLVM_DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
dbgs() << ".\nRegs:\n";
for (const SCEV *S : NewRegs) dbgs()
<< "- " << *S << "\n";
dbgs() << '\n');

SolutionCost = NewCost;
Solution = Workspace;
// Evaluate the cost of the current formula. If it's already worse than
// the current best, prune the search at that point.
FormulaeTested = true;
NewCost = CurCost;
NewRegs = CurRegs;
NewCost.RateFormula(F, NewRegs, VisitedRegs, LU);
if (NewCost.isLess(SolutionCost)) {
Workspace.push_back(&F);
if (Workspace.size() != Uses.size()) {
SolveRecurse(Solution, SolutionCost, Workspace, NewCost, NewRegs,
VisitedRegs);
if (F.getNumRegs() == 1 && Workspace.size() == 1)
VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]);
} else {
LLVM_DEBUG({
dbgs() << "New best at ";
NewCost.print(dbgs());
dbgs() << ".\nRegs:\n";
for (const SCEV *S : NewRegs)
dbgs() << "- " << *S << "\n";
dbgs() << '\n';
});
SolutionCost = NewCost;
Solution = Workspace;
}
Workspace.pop_back();
}
Workspace.pop_back();
}

// None of the formulae has the necessary number of required registers,
// so weaken the requirement.
NumReqRegsToIgnore++;
}
}
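
A standalone sketch of the retry-with-relaxation control flow implemented by the while loop above: first demand that a candidate cover all required items, then weaken the requirement by one item per pass until something qualifies. All names are hypothetical; this is a simplified model of the idea, not the PR's code:

```cpp
// Sketch only: retry-with-relaxation. Start by requiring full coverage of
// the required items; if no candidate qualifies, ignore one more item per
// pass.
#include <cstddef>
#include <iostream>
#include <vector>

// CoveredCounts[i] = how many of the Required items candidate i covers.
// Returns the first candidate covering at least Required - Ignored items,
// for the smallest Ignored that admits any candidate; -1 if none covers
// even a single item.
static int pickCandidate(const std::vector<int> &CoveredCounts,
                         std::size_t Required) {
  for (std::size_t Ignored = 0; Ignored < Required; ++Ignored)
    for (std::size_t I = 0; I < CoveredCounts.size(); ++I)
      if (static_cast<std::size_t>(CoveredCounts[I]) >= Required - Ignored)
        return static_cast<int>(I);
  return -1;
}

int main() {
  std::vector<int> Covered = {1, 2, 0}; // no candidate covers all 3 items
  // The first pass (Ignored = 0) finds nothing; the second accepts
  // candidate 1, which covers 2 of the 3 required items.
  std::cout << pickCandidate(Covered, 3) << '\n'; // prints: 1
}
```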

@@ -6180,7 +6243,11 @@ LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
NarrowSearchSpaceUsingHeuristics();

SmallVector<const Formula *, 8> Solution;
Solve(Solution);
bool LSRUsesConsistent = SortLSRUses();

if (LSRUsesConsistent) {
Solve(Solution);
}

// Release memory that is no longer needed.
Factors.clear();
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/AArch64/aarch64-p2align-max-bytes.ll
@@ -16,14 +16,14 @@ define i32 @a(i32 %x, ptr nocapture readonly %y, ptr nocapture readonly %z) {
; CHECK-IMPLICIT: .p2align 5
; CHECK-NEXT: .LBB0_8: // %for.body
; CHECK-OBJ;Disassembly of section .text:
; CHECK-OBJ: 88: 8b0a002a add
; CHECK-OBJ: 88: 8b0b002b add
; CHECK-OBJ-IMPLICIT-NEXT: 8c: d503201f nop
; CHECK-OBJ-IMPLICIT-NEXT: 90: d503201f nop
; CHECK-OBJ-IMPLICIT-NEXT: 94: d503201f nop
; CHECK-OBJ-IMPLICIT-NEXT: 98: d503201f nop
; CHECK-OBJ-IMPLICIT-NEXT: 9c: d503201f nop
; CHECK-OBJ-IMPLICIT-NEXT: a0: b840454b ldr
; CHECK-OBJ-EXPLICIT-NEXT: 8c: b840454b ldr
; CHECK-OBJ-IMPLICIT-NEXT: a0: b8404569 ldr
; CHECK-OBJ-EXPLICIT-NEXT: 8c: b8404569 ldr
entry:
%cmp10 = icmp sgt i32 %x, 0
br i1 %cmp10, label %for.body.preheader, label %for.cond.cleanup
12 changes: 6 additions & 6 deletions llvm/test/CodeGen/AArch64/machine-combiner-copy.ll
@@ -33,17 +33,17 @@ define void @fma_dup_f16(ptr noalias nocapture noundef readonly %A, half noundef
; CHECK-NEXT: cmp x9, x8
; CHECK-NEXT: b.eq .LBB0_8
; CHECK-NEXT: .LBB0_6: // %for.body.preheader1
; CHECK-NEXT: lsl x10, x9, #1
; CHECK-NEXT: lsl x11, x9, #1
; CHECK-NEXT: sub x8, x8, x9
; CHECK-NEXT: add x9, x1, x10
; CHECK-NEXT: add x10, x0, x10
; CHECK-NEXT: add x10, x1, x11
; CHECK-NEXT: add x11, x0, x11
; CHECK-NEXT: .LBB0_7: // %for.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr h1, [x10], #2
; CHECK-NEXT: ldr h2, [x9]
; CHECK-NEXT: ldr h1, [x11], #2
; CHECK-NEXT: ldr h2, [x10]
; CHECK-NEXT: subs x8, x8, #1
; CHECK-NEXT: fmadd h1, h1, h0, h2
; CHECK-NEXT: str h1, [x9], #2
; CHECK-NEXT: str h1, [x10], #2
; CHECK-NEXT: b.ne .LBB0_7
; CHECK-NEXT: .LBB0_8: // %for.cond.cleanup
; CHECK-NEXT: ret
20 changes: 10 additions & 10 deletions llvm/test/CodeGen/AArch64/machine-licm-sub-loop.ll
@@ -36,23 +36,23 @@ define void @foo(i32 noundef %limit, ptr %out, ptr %y) {
; CHECK-NEXT: .LBB0_5: // %vector.ph
; CHECK-NEXT: // in Loop: Header=BB0_3 Depth=1
; CHECK-NEXT: dup v0.8h, w15
; CHECK-NEXT: mov x16, x14
; CHECK-NEXT: mov x17, x13
; CHECK-NEXT: mov x18, x12
; CHECK-NEXT: mov x16, x12
; CHECK-NEXT: mov x17, x14
; CHECK-NEXT: mov x18, x13
; CHECK-NEXT: .LBB0_6: // %vector.body
; CHECK-NEXT: // Parent Loop BB0_3 Depth=1
; CHECK-NEXT: // => This Inner Loop Header: Depth=2
; CHECK-NEXT: ldp q1, q4, [x16, #-16]
; CHECK-NEXT: subs x18, x18, #16
; CHECK-NEXT: ldp q3, q2, [x17, #-32]
; CHECK-NEXT: add x16, x16, #32
; CHECK-NEXT: ldp q6, q5, [x17]
; CHECK-NEXT: ldp q1, q4, [x17, #-16]
; CHECK-NEXT: subs x16, x16, #16
; CHECK-NEXT: ldp q3, q2, [x18, #-32]
; CHECK-NEXT: add x17, x17, #32
; CHECK-NEXT: ldp q6, q5, [x18]
; CHECK-NEXT: smlal2 v2.4s, v0.8h, v1.8h
; CHECK-NEXT: smlal v3.4s, v0.4h, v1.4h
; CHECK-NEXT: smlal2 v5.4s, v0.8h, v4.8h
; CHECK-NEXT: smlal v6.4s, v0.4h, v4.4h
; CHECK-NEXT: stp q3, q2, [x17, #-32]
; CHECK-NEXT: stp q6, q5, [x17], #64
; CHECK-NEXT: stp q3, q2, [x18, #-32]
; CHECK-NEXT: stp q6, q5, [x18], #64
; CHECK-NEXT: b.ne .LBB0_6
; CHECK-NEXT: // %bb.7: // %middle.block
; CHECK-NEXT: // in Loop: Header=BB0_3 Depth=1
54 changes: 27 additions & 27 deletions llvm/test/CodeGen/AArch64/zext-to-tbl.ll
@@ -1666,33 +1666,33 @@ define void @zext_v8i8_to_v8i64_with_add_in_sequence_in_loop(ptr %src, ptr %dst)
; CHECK-NEXT: ldr q0, [x8, lCPI17_0@PAGEOFF]
; CHECK-NEXT: Lloh21:
; CHECK-NEXT: ldr q1, [x9, lCPI17_1@PAGEOFF]
; CHECK-NEXT: add x8, x1, #64
; CHECK-NEXT: add x9, x0, #8
; CHECK-NEXT: add x8, x0, #8
; CHECK-NEXT: add x9, x1, #64
; CHECK-NEXT: LBB17_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldp d2, d3, [x9, #-8]
; CHECK-NEXT: ldp d2, d3, [x8, #-8]
; CHECK-NEXT: subs x10, x10, #16
; CHECK-NEXT: ldp q7, q5, [x8, #-32]
; CHECK-NEXT: add x9, x9, #16
; CHECK-NEXT: ldp q17, q6, [x8, #-64]
; CHECK-NEXT: ldp q7, q5, [x9, #-32]
; CHECK-NEXT: add x8, x8, #16
; CHECK-NEXT: ldp q17, q6, [x9, #-64]
; CHECK-NEXT: tbl.16b v4, { v2 }, v1
; CHECK-NEXT: tbl.16b v2, { v2 }, v0
; CHECK-NEXT: tbl.16b v16, { v3 }, v1
; CHECK-NEXT: tbl.16b v3, { v3 }, v0
; CHECK-NEXT: uaddw2.2d v5, v5, v4
; CHECK-NEXT: uaddw2.2d v6, v6, v2
; CHECK-NEXT: uaddw.2d v4, v7, v4
; CHECK-NEXT: ldp q18, q7, [x8, #32]
; CHECK-NEXT: ldp q18, q7, [x9, #32]
; CHECK-NEXT: uaddw.2d v2, v17, v2
; CHECK-NEXT: stp q4, q5, [x8, #-32]
; CHECK-NEXT: stp q4, q5, [x9, #-32]
; CHECK-NEXT: uaddw2.2d v5, v7, v16
; CHECK-NEXT: stp q2, q6, [x8, #-64]
; CHECK-NEXT: stp q2, q6, [x9, #-64]
; CHECK-NEXT: uaddw.2d v16, v18, v16
; CHECK-NEXT: ldp q7, q6, [x8]
; CHECK-NEXT: stp q16, q5, [x8, #32]
; CHECK-NEXT: ldp q7, q6, [x9]
; CHECK-NEXT: stp q16, q5, [x9, #32]
; CHECK-NEXT: uaddw2.2d v4, v6, v3
; CHECK-NEXT: uaddw.2d v2, v7, v3
; CHECK-NEXT: stp q2, q4, [x8], #128
; CHECK-NEXT: stp q2, q4, [x9], #128
; CHECK-NEXT: b.ne LBB17_1
; CHECK-NEXT: ; %bb.2: ; %exit
; CHECK-NEXT: ret
@@ -1708,31 +1708,31 @@ define void @zext_v8i8_to_v8i64_with_add_in_sequence_in_loop(ptr %src, ptr %dst)
; CHECK-BE-NEXT: adrp x9, .LCPI17_1
; CHECK-BE-NEXT: add x9, x9, :lo12:.LCPI17_1
; CHECK-BE-NEXT: ld1 { v1.16b }, [x9]
; CHECK-BE-NEXT: add x9, x1, #64
; CHECK-BE-NEXT: add x10, x0, #8
; CHECK-BE-NEXT: add x9, x0, #8
; CHECK-BE-NEXT: add x10, x1, #64
; CHECK-BE-NEXT: .LBB17_1: // %loop
; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-BE-NEXT: ld1 { v2.8b }, [x10]
; CHECK-BE-NEXT: sub x11, x10, #8
; CHECK-BE-NEXT: add x15, x9, #32
; CHECK-BE-NEXT: ld1 { v2.8b }, [x9]
; CHECK-BE-NEXT: sub x11, x9, #8
; CHECK-BE-NEXT: add x15, x10, #32
; CHECK-BE-NEXT: ld1 { v3.8b }, [x11]
; CHECK-BE-NEXT: ld1 { v16.2d }, [x15]
; CHECK-BE-NEXT: sub x11, x9, #64
; CHECK-BE-NEXT: sub x12, x9, #32
; CHECK-BE-NEXT: ld1 { v6.2d }, [x9]
; CHECK-BE-NEXT: sub x11, x10, #64
; CHECK-BE-NEXT: sub x12, x10, #32
; CHECK-BE-NEXT: ld1 { v6.2d }, [x10]
; CHECK-BE-NEXT: ld1 { v21.2d }, [x11]
; CHECK-BE-NEXT: tbl v4.16b, { v2.16b }, v1.16b
; CHECK-BE-NEXT: tbl v2.16b, { v2.16b }, v0.16b
; CHECK-BE-NEXT: ld1 { v19.2d }, [x12]
; CHECK-BE-NEXT: tbl v5.16b, { v3.16b }, v1.16b
; CHECK-BE-NEXT: tbl v3.16b, { v3.16b }, v0.16b
; CHECK-BE-NEXT: sub x13, x9, #16
; CHECK-BE-NEXT: sub x14, x9, #48
; CHECK-BE-NEXT: add x16, x9, #48
; CHECK-BE-NEXT: add x17, x9, #16
; CHECK-BE-NEXT: sub x13, x10, #16
; CHECK-BE-NEXT: sub x14, x10, #48
; CHECK-BE-NEXT: add x16, x10, #48
; CHECK-BE-NEXT: add x17, x10, #16
; CHECK-BE-NEXT: ld1 { v22.2d }, [x13]
; CHECK-BE-NEXT: subs x8, x8, #16
; CHECK-BE-NEXT: add x10, x10, #16
; CHECK-BE-NEXT: add x9, x9, #16
; CHECK-BE-NEXT: rev32 v7.8b, v4.8b
; CHECK-BE-NEXT: ext v4.16b, v4.16b, v4.16b, #8
; CHECK-BE-NEXT: rev32 v17.8b, v2.8b
Expand All @@ -1753,8 +1753,8 @@ define void @zext_v8i8_to_v8i64_with_add_in_sequence_in_loop(ptr %src, ptr %dst)
; CHECK-BE-NEXT: uaddw v3.2d, v21.2d, v3.2s
; CHECK-BE-NEXT: st1 { v7.2d }, [x15]
; CHECK-BE-NEXT: ld1 { v7.2d }, [x17]
; CHECK-BE-NEXT: st1 { v6.2d }, [x9]
; CHECK-BE-NEXT: add x9, x9, #128
; CHECK-BE-NEXT: st1 { v6.2d }, [x10]
; CHECK-BE-NEXT: add x10, x10, #128
; CHECK-BE-NEXT: uaddw v4.2d, v16.2d, v4.2s
; CHECK-BE-NEXT: st1 { v5.2d }, [x12]
; CHECK-BE-NEXT: uaddw v5.2d, v22.2d, v17.2s