diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 89f63c3b66aad..7b5137b0185ab 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -322,7 +322,8 @@ class Vectorizer {
   template <bool IsLoadChain>
   bool isSafeToMove(
       Instruction *ChainElem, Instruction *ChainBegin,
-      const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets);
+      const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets,
+      BatchAAResults &BatchAA);
 
   /// Merges the equivalence classes if they have underlying objects that differ
   /// by one level of indirection (i.e., one is a getelementptr and the other is
@@ -543,6 +544,10 @@ std::vector<Chain> Vectorizer::splitChainByMayAliasInstrs(Chain &C) {
   for (const auto &E : C)
     ChainOffsets.insert({&*E.Inst, E.OffsetFromLeader});
 
+  // Across a single invocation of this function the IR is not changing, so
+  // using a batched Alias Analysis is safe and can reduce compile time.
+  BatchAAResults BatchAA(AA);
+
   // Loads get hoisted up to the first load in the chain. Stores get sunk
   // down to the last store in the chain. Our algorithm for loads is:
   //
@@ -569,7 +574,7 @@ std::vector<Chain> Vectorizer::splitChainByMayAliasInstrs(Chain &C) {
     NewChain.emplace_back(*ChainBegin);
     for (auto ChainIt = std::next(ChainBegin); ChainIt != ChainEnd; ++ChainIt) {
       if (isSafeToMove<IsLoad>(ChainIt->Inst, NewChain.front().Inst,
-                               ChainOffsets)) {
+                               ChainOffsets, BatchAA)) {
         LLVM_DEBUG(dbgs() << "LSV: No intervening may-alias instrs; can merge "
                           << *ChainIt->Inst << " into " << *ChainBegin->Inst
                           << "\n");
@@ -999,7 +1004,8 @@ bool Vectorizer::vectorizeChain(Chain &C) {
 template <bool IsLoadChain>
 bool Vectorizer::isSafeToMove(
     Instruction *ChainElem, Instruction *ChainBegin,
-    const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets) {
+    const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets,
+    BatchAAResults &BatchAA) {
   LLVM_DEBUG(dbgs() << "LSV: isSafeToMove(" << *ChainElem << " -> "
                     << *ChainBegin << ")\n");
 
@@ -1066,7 +1072,8 @@ bool Vectorizer::isSafeToMove(
         LLVM_DEBUG({
           // Double check that AA also sees this alias. If not, we probably
           // have a bug.
-          ModRefInfo MR = AA.getModRefInfo(I, MemoryLocation::get(ChainElem));
+          ModRefInfo MR =
+              BatchAA.getModRefInfo(I, MemoryLocation::get(ChainElem));
           assert(IsLoadChain ? isModSet(MR) : isModOrRefSet(MR));
           dbgs() << "LSV: Found alias in chain: " << *I << "\n";
         });
@@ -1077,7 +1084,7 @@
     }
 
     LLVM_DEBUG(dbgs() << "LSV: Querying AA for " << *I << "\n");
-    ModRefInfo MR = AA.getModRefInfo(I, MemoryLocation::get(ChainElem));
+    ModRefInfo MR = BatchAA.getModRefInfo(I, MemoryLocation::get(ChainElem));
     if (IsLoadChain ? isModSet(MR) : isModOrRefSet(MR)) {
       LLVM_DEBUG(dbgs() << "LSV: Found alias in chain:\n"
                         << "  Aliasing instruction:\n"
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/batch-aa-compile-time.ll b/llvm/test/Transforms/LoadStoreVectorizer/batch-aa-compile-time.ll
new file mode 100644
index 0000000000000..049c9a3f70e0f
--- /dev/null
+++ b/llvm/test/Transforms/LoadStoreVectorizer/batch-aa-compile-time.ll
@@ -0,0 +1,60 @@
+; RUN: opt -S < %s -passes='loop-unroll,load-store-vectorizer' -unroll-count=128 --capture-tracking-max-uses-to-explore=1024 | FileCheck %s
+
+; Without batching alias analysis, this test takes about 6 seconds to compile. With it, it takes less than a second.
+; This is because the mechanism that proves NoAlias in this case is very expensive (CaptureTracking.cpp),
+; and caching the result leads to 2 calls to that mechanism instead of ~300,000 (run with -stats to see the difference).
+
+; This test only demonstrates the compile-time issue if capture-tracking-max-uses-to-explore is set to at least 1024,
+; because with the default value of 100, the CaptureTracking analysis is not run, NoAlias is not proven, and the vectorizer gives up early.
+
+@global_mem = external global i8, align 4
+
+define void @compile-time-test() {
+; CHECK-LABEL: define void @compile-time-test() {
+; CHECK-COUNT-128: load <4 x i8>
+entry:
+  ; Create a base pointer to the global variable with an inefficient pattern that Alias Analysis cannot easily traverse through.
+  %global_base_loads = getelementptr i8, ptr inttoptr (i32 ptrtoint (ptr @global_mem to i32) to ptr), i64 0
+
+  ; Create another pointer for the stores.
+  %local_base_stores = alloca <512 x i8>, align 4
+
+  ; 512 interleaved loads and stores in a loop that gets unrolled
+  br label %loop
+
+loop:
+  %i = phi i64 [ 0, %entry ], [ %i_next, %loop ]
+
+  %ptr_0 = getelementptr i8, ptr %global_base_loads, i64 %i
+  %load_0 = load i8, ptr %ptr_0, align 4
+  %ptr2_0 = getelementptr i8, ptr %local_base_stores, i64 %i
+  store i8 %load_0, ptr %ptr2_0, align 4
+
+  %i_1 = add i64 %i, 1
+
+  %ptr_1 = getelementptr i8, ptr %global_base_loads, i64 %i_1
+  %load_1 = load i8, ptr %ptr_1, align 1
+  %ptr2_1 = getelementptr i8, ptr %local_base_stores, i64 %i_1
+  store i8 %load_1, ptr %ptr2_1, align 1
+
+  %i_2 = add i64 %i, 2
+
+  %ptr_2 = getelementptr i8, ptr %global_base_loads, i64 %i_2
+  %load_2 = load i8, ptr %ptr_2, align 2
+  %ptr2_2 = getelementptr i8, ptr %local_base_stores, i64 %i_2
+  store i8 %load_2, ptr %ptr2_2, align 2
+
+  %i_3 = add i64 %i, 3
+
+  %ptr_3 = getelementptr i8, ptr %global_base_loads, i64 %i_3
+  %load_3 = load i8, ptr %ptr_3, align 1
+  %ptr2_3 = getelementptr i8, ptr %local_base_stores, i64 %i_3
+  store i8 %load_3, ptr %ptr2_3, align 1
+
+  %i_next = add i64 %i, 4
+  %cmp = icmp ult i64 %i_next, 512
+  br i1 %cmp, label %loop, label %done
+
+done:
+  ret void
+}
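
Note (editor's sketch, not part of the patch): BatchAAResults wraps an existing AAResults instance and memoizes alias/ModRef queries through a shared query cache, which is sound only while the IR is not modified between queries. That is why the patch constructs one BatchAAResults per splitChainByMayAliasInstrs invocation and threads it by reference into isSafeToMove, rather than building a fresh one per query. Below is a minimal self-contained illustration of the same pattern; it uses real LLVM APIs (BatchAAResults, MemoryLocation::get, getModRefInfo, isModSet), but the helper name scanForAliasingWriter and its load-only logic are hypothetical, not code from the pass.

#include "llvm/ADT/ArrayRef.h"
#include "llvm/Analysis/AliasAnalysis.h" // AAResults, BatchAAResults, isModSet
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Hypothetical helper mirroring the load-chain side of isSafeToMove: returns
// true if any instruction in Candidates may write to the memory read by Load.
static bool scanForAliasingWriter(AAResults &AA,
                                  ArrayRef<Instruction *> Candidates,
                                  LoadInst *Load) {
  // The cache inside BatchAAResults stays valid only while the IR is left
  // unmodified, so its lifetime is confined to this read-only scan.
  BatchAAResults BatchAA(AA);
  MemoryLocation Loc = MemoryLocation::get(Load);
  for (Instruction *I : Candidates)
    // Repeated alias queries over the same pointer pairs are answered from
    // BatchAA's cache instead of re-running the underlying analyses
    // (e.g. CaptureTracking, the expensive mechanism in the test above).
    if (isModSet(BatchAA.getModRefInfo(I, Loc)))
      return true; // Found a may-aliasing writer; a real chain would bail here.
  return false;
}

A BatchAAResults constructed inside the loop would start with an empty cache on every query and give none of the compile-time benefit; keeping a single instance alive across the whole scan is the design point of the patch.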