Skip to content

Commit 8e7461e

Browse files
authored
[LoadStoreVectorizer] Batch alias analysis results to improve compile time (#147555)
This should be generally beneficial for many LSV cases, but the attached test demonstrates a specific compile-time issue that appears when the `CaptureTracking` default max uses is raised. Without batched alias analysis, this test takes 6 seconds to compile in a release build; with it, compilation takes less than a second. This is because the mechanism that proves `NoAlias` in this case is very expensive (`CaptureTracking.cpp`), and caching the result leads to 2 calls to that mechanism instead of ~300,000 (run with -stats to see the difference). This test only demonstrates the compile-time issue if `capture-tracking-max-uses-to-explore` is set to at least 1024, because with the default value of 100, the `CaptureTracking` analysis is not run, `NoAlias` is not proven, and the vectorizer gives up early.
1 parent 54ec521 commit 8e7461e

File tree

2 files changed

+72
-5
lines changed

2 files changed

+72
-5
lines changed

llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -322,7 +322,8 @@ class Vectorizer {
322322
template <bool IsLoadChain>
323323
bool isSafeToMove(
324324
Instruction *ChainElem, Instruction *ChainBegin,
325-
const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets);
325+
const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets,
326+
BatchAAResults &BatchAA);
326327

327328
/// Merges the equivalence classes if they have underlying objects that differ
328329
/// by one level of indirection (i.e., one is a getelementptr and the other is
@@ -543,6 +544,10 @@ std::vector<Chain> Vectorizer::splitChainByMayAliasInstrs(Chain &C) {
543544
for (const auto &E : C)
544545
ChainOffsets.insert({&*E.Inst, E.OffsetFromLeader});
545546

547+
// Across a single invocation of this function the IR is not changing, so
548+
// using a batched Alias Analysis is safe and can reduce compile time.
549+
BatchAAResults BatchAA(AA);
550+
546551
// Loads get hoisted up to the first load in the chain. Stores get sunk
547552
// down to the last store in the chain. Our algorithm for loads is:
548553
//
@@ -569,7 +574,7 @@ std::vector<Chain> Vectorizer::splitChainByMayAliasInstrs(Chain &C) {
569574
NewChain.emplace_back(*ChainBegin);
570575
for (auto ChainIt = std::next(ChainBegin); ChainIt != ChainEnd; ++ChainIt) {
571576
if (isSafeToMove<IsLoad>(ChainIt->Inst, NewChain.front().Inst,
572-
ChainOffsets)) {
577+
ChainOffsets, BatchAA)) {
573578
LLVM_DEBUG(dbgs() << "LSV: No intervening may-alias instrs; can merge "
574579
<< *ChainIt->Inst << " into " << *ChainBegin->Inst
575580
<< "\n");
@@ -999,7 +1004,8 @@ bool Vectorizer::vectorizeChain(Chain &C) {
9991004
template <bool IsLoadChain>
10001005
bool Vectorizer::isSafeToMove(
10011006
Instruction *ChainElem, Instruction *ChainBegin,
1002-
const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets) {
1007+
const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets,
1008+
BatchAAResults &BatchAA) {
10031009
LLVM_DEBUG(dbgs() << "LSV: isSafeToMove(" << *ChainElem << " -> "
10041010
<< *ChainBegin << ")\n");
10051011

@@ -1066,7 +1072,8 @@ bool Vectorizer::isSafeToMove(
10661072
LLVM_DEBUG({
10671073
// Double check that AA also sees this alias. If not, we probably
10681074
// have a bug.
1069-
ModRefInfo MR = AA.getModRefInfo(I, MemoryLocation::get(ChainElem));
1075+
ModRefInfo MR =
1076+
BatchAA.getModRefInfo(I, MemoryLocation::get(ChainElem));
10701077
assert(IsLoadChain ? isModSet(MR) : isModOrRefSet(MR));
10711078
dbgs() << "LSV: Found alias in chain: " << *I << "\n";
10721079
});
@@ -1077,7 +1084,7 @@ bool Vectorizer::isSafeToMove(
10771084
}
10781085

10791086
LLVM_DEBUG(dbgs() << "LSV: Querying AA for " << *I << "\n");
1080-
ModRefInfo MR = AA.getModRefInfo(I, MemoryLocation::get(ChainElem));
1087+
ModRefInfo MR = BatchAA.getModRefInfo(I, MemoryLocation::get(ChainElem));
10811088
if (IsLoadChain ? isModSet(MR) : isModOrRefSet(MR)) {
10821089
LLVM_DEBUG(dbgs() << "LSV: Found alias in chain:\n"
10831090
<< " Aliasing instruction:\n"
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
; RUN: opt -S < %s -passes='loop-unroll,load-store-vectorizer' -unroll-count=128 --capture-tracking-max-uses-to-explore=1024 | FileCheck %s
2+
3+
; Without batched alias analysis, this test takes 6 seconds to compile. With it, less than a second.
4+
; This is because the mechanism that proves NoAlias in this case is very expensive (CaptureTracking.cpp),
5+
; and caching the result leads to 2 calls to that mechanism instead of ~300,000 (run with -stats to see the difference).
6+
7+
; This test only demonstrates the compile time issue if capture-tracking-max-uses-to-explore is set to at least 1024,
8+
; because with the default value of 100, the CaptureTracking analysis is not run, NoAlias is not proven, and the vectorizer gives up early.
9+
10+
@global_mem = external global i8, align 4
11+
12+
define void @compile-time-test() {
13+
; CHECK-LABEL: define void @compile-time-test() {
14+
; CHECK-COUNT-128: load <4 x i8>
15+
entry:
16+
; Create a base pointer to a global variable through an inttoptr/ptrtoint round trip, an inefficient pattern that Alias Analysis cannot easily traverse through.
17+
%global_base_loads = getelementptr i8, ptr inttoptr (i32 ptrtoint (ptr @global_mem to i32) to ptr), i64 0
18+
19+
; Create another pointer for the stores.
20+
%local_base_stores = alloca <512 x i8>, align 4
21+
22+
; 512 interwoven loads and stores in a loop that gets unrolled
23+
br label %loop
24+
25+
loop:
26+
%i = phi i64 [ 0, %entry ], [ %i_next, %loop ]
27+
28+
%ptr_0 = getelementptr i8, ptr %global_base_loads, i64 %i
29+
%load_0 = load i8, ptr %ptr_0, align 4
30+
%ptr2_0 = getelementptr i8, ptr %local_base_stores, i64 %i
31+
store i8 %load_0, ptr %ptr2_0, align 4
32+
33+
%i_1 = add i64 %i, 1
34+
35+
%ptr_1 = getelementptr i8, ptr %global_base_loads, i64 %i_1
36+
%load_1 = load i8, ptr %ptr_1, align 1
37+
%ptr2_1 = getelementptr i8, ptr %local_base_stores, i64 %i_1
38+
store i8 %load_1, ptr %ptr2_1, align 1
39+
40+
%i_2 = add i64 %i, 2
41+
42+
%ptr_2 = getelementptr i8, ptr %global_base_loads, i64 %i_2
43+
%load_2 = load i8, ptr %ptr_2, align 2
44+
%ptr2_2 = getelementptr i8, ptr %local_base_stores, i64 %i_2
45+
store i8 %load_2, ptr %ptr2_2, align 2
46+
47+
%i_3 = add i64 %i, 3
48+
49+
%ptr_3 = getelementptr i8, ptr %global_base_loads, i64 %i_3
50+
%load_3 = load i8, ptr %ptr_3, align 1
51+
%ptr2_3 = getelementptr i8, ptr %local_base_stores, i64 %i_3
52+
store i8 %load_3, ptr %ptr2_3, align 1
53+
54+
%i_next = add i64 %i, 4
55+
%cmp = icmp ult i64 %i_next, 512
56+
br i1 %cmp, label %loop, label %done
57+
58+
done:
59+
ret void
60+
}

0 commit comments

Comments
 (0)