[LoadStoreVectorizer] Batch alias analysis results to improve compile time (#147555)

dakersnar · web-flow · commit 8e7461e29a7c · 2025-07-10T11:23:33.000-05:00
This should be generally beneficial for many LSV cases, but the attached
test demonstrates a specific compile time issue that appears when the
`CaptureTracking` max-uses limit is raised above its default.

Without batching alias analysis results, this test takes 6 seconds to
compile in a release build; with batching, it takes less than a second.
This is because the mechanism that proves `NoAlias` in this case is very
expensive (`CaptureTracking.cpp`), and caching the result leads to 2 calls
to that mechanism instead of ~300,000 (run with -stats to see the difference).

This test only demonstrates the compile time issue if
`capture-tracking-max-uses-to-explore` is set to at least 1024, because
with the default value of 100, the `CaptureTracking` analysis is not
run, `NoAlias` is not proven, and the vectorizer gives up early.
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -322,7 +322,8 @@ class Vectorizer {
   template <bool IsLoadChain>
   bool isSafeToMove(
       Instruction *ChainElem, Instruction *ChainBegin,
-      const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets);
+      const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets,
+      BatchAAResults &BatchAA);
 
   /// Merges the equivalence classes if they have underlying objects that differ
   /// by one level of indirection (i.e., one is a getelementptr and the other is
@@ -543,6 +544,10 @@ std::vector<Chain> Vectorizer::splitChainByMayAliasInstrs(Chain &C) {
   for (const auto &E : C)
     ChainOffsets.insert({&*E.Inst, E.OffsetFromLeader});
 
+  // Across a single invocation of this function the IR is not changing, so
+  // using a batched Alias Analysis is safe and can reduce compile time.
+  BatchAAResults BatchAA(AA);
+
   // Loads get hoisted up to the first load in the chain.  Stores get sunk
   // down to the last store in the chain.  Our algorithm for loads is:
   //
@@ -569,7 +574,7 @@ std::vector<Chain> Vectorizer::splitChainByMayAliasInstrs(Chain &C) {
     NewChain.emplace_back(*ChainBegin);
     for (auto ChainIt = std::next(ChainBegin); ChainIt != ChainEnd; ++ChainIt) {
       if (isSafeToMove<IsLoad>(ChainIt->Inst, NewChain.front().Inst,
-                               ChainOffsets)) {
+                               ChainOffsets, BatchAA)) {
         LLVM_DEBUG(dbgs() << "LSV: No intervening may-alias instrs; can merge "
                           << *ChainIt->Inst << " into " << *ChainBegin->Inst
                           << "\n");
@@ -999,7 +1004,8 @@ bool Vectorizer::vectorizeChain(Chain &C) {
 template <bool IsLoadChain>
 bool Vectorizer::isSafeToMove(
     Instruction *ChainElem, Instruction *ChainBegin,
-    const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets) {
+    const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets,
+    BatchAAResults &BatchAA) {
   LLVM_DEBUG(dbgs() << "LSV: isSafeToMove(" << *ChainElem << " -> "
                     << *ChainBegin << ")\n");
 
@@ -1066,7 +1072,8 @@ bool Vectorizer::isSafeToMove(
         LLVM_DEBUG({
           // Double check that AA also sees this alias.  If not, we probably
           // have a bug.
-          ModRefInfo MR = AA.getModRefInfo(I, MemoryLocation::get(ChainElem));
+          ModRefInfo MR =
+              BatchAA.getModRefInfo(I, MemoryLocation::get(ChainElem));
           assert(IsLoadChain ? isModSet(MR) : isModOrRefSet(MR));
           dbgs() << "LSV: Found alias in chain: " << *I << "\n";
         });
@@ -1077,7 +1084,7 @@ bool Vectorizer::isSafeToMove(
     }
 
     LLVM_DEBUG(dbgs() << "LSV: Querying AA for " << *I << "\n");
-    ModRefInfo MR = AA.getModRefInfo(I, MemoryLocation::get(ChainElem));
+    ModRefInfo MR = BatchAA.getModRefInfo(I, MemoryLocation::get(ChainElem));
     if (IsLoadChain ? isModSet(MR) : isModOrRefSet(MR)) {
       LLVM_DEBUG(dbgs() << "LSV: Found alias in chain:\n"
                         << "  Aliasing instruction:\n"
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/batch-aa-compile-time.ll b/llvm/test/Transforms/LoadStoreVectorizer/batch-aa-compile-time.ll
@@ -0,0 +1,60 @@
+; RUN: opt -S < %s -passes='loop-unroll,load-store-vectorizer' -unroll-count=128 --capture-tracking-max-uses-to-explore=1024 | FileCheck %s
+
+; Without batching alias analysis results, this test takes 6 seconds to compile; with batching, it takes less than a second.
+; This is because the mechanism that proves NoAlias in this case is very expensive (CaptureTracking.cpp),
+; and caching the result leads to 2 calls to that mechanism instead of ~300,000 (run with -stats to see the difference).
+
+; This test only demonstrates the compile time issue if capture-tracking-max-uses-to-explore is set to at least 1024,
+; because with the default value of 100, the CaptureTracking analysis is not run, NoAlias is not proven, and the vectorizer gives up early.
+
+@global_mem = external global i8, align 4
+
+define void @compile-time-test() {
+; CHECK-LABEL: define void @compile-time-test() {
+; CHECK-COUNT-128: load <4 x i8>
+entry:
+  ; Create a base pointer to a global variable via an inttoptr/ptrtoint round trip, an inefficient pattern that Alias Analysis cannot easily traverse through.
+  %global_base_loads = getelementptr i8, ptr inttoptr (i32 ptrtoint (ptr @global_mem to i32) to ptr), i64 0
+
+  ; Create another pointer for the stores.
+  %local_base_stores = alloca <512 x i8>, align 4
+
+  ; 512 interleaved load/store pairs (4 per iteration, 128 iterations) in a loop that gets unrolled
+  br label %loop
+
+loop:
+  %i = phi i64 [ 0, %entry ], [ %i_next, %loop ]
+
+  %ptr_0 = getelementptr i8, ptr %global_base_loads, i64 %i
+  %load_0 = load i8, ptr %ptr_0, align 4
+  %ptr2_0 = getelementptr i8, ptr %local_base_stores, i64 %i
+  store i8 %load_0, ptr %ptr2_0, align 4
+
+  %i_1 = add i64 %i, 1
+
+  %ptr_1 = getelementptr i8, ptr %global_base_loads, i64 %i_1
+  %load_1 = load i8, ptr %ptr_1, align 1
+  %ptr2_1 = getelementptr i8, ptr %local_base_stores, i64 %i_1
+  store i8 %load_1, ptr %ptr2_1, align 1
+
+  %i_2 = add i64 %i, 2
+
+  %ptr_2 = getelementptr i8, ptr %global_base_loads, i64 %i_2
+  %load_2 = load i8, ptr %ptr_2, align 2
+  %ptr2_2 = getelementptr i8, ptr %local_base_stores, i64 %i_2
+  store i8 %load_2, ptr %ptr2_2, align 2
+
+  %i_3 = add i64 %i, 3
+
+  %ptr_3 = getelementptr i8, ptr %global_base_loads, i64 %i_3
+  %load_3 = load i8, ptr %ptr_3, align 1
+  %ptr2_3 = getelementptr i8, ptr %local_base_stores, i64 %i_3
+  store i8 %load_3, ptr %ptr2_3, align 1
+
+  %i_next = add i64 %i, 4
+  %cmp = icmp ult i64 %i_next, 512
+  br i1 %cmp, label %loop, label %done
+
+done:
+  ret void
+}