Skip to content

Commit 2948775

Browse files
authored
[LoopInterchange] Defer CacheCost calculation until needed (#146874)
LoopInterchange currently stops and exits early when the number of loads/stores in the loop is greater than `MaxMemInstrCount`. This prevents excessive queries to DependenceAnalysis. However, computing `CacheCost` also involves DependenceAnalysis queries, and their number can grow to `O(N^2)` in the worst case, where `N` is the number of loads/stores in the loop. Therefore, we should also avoid calculating it if the load/store count exceeds `MaxMemInstrCount`. This patch defers the calculation of `CacheCost` until it is actually needed, in order to reduce compile time. This avoids computing `CacheCost` when the number of loads/stores is large. Additionally, since this patch delays the calculation as much as possible, it is also effective in other scenarios, e.g., when there are no legal loop pairs to interchange.
1 parent 2e8e254 commit 2948775

File tree

2 files changed

+148
-34
lines changed

2 files changed

+148
-34
lines changed

llvm/lib/Transforms/Scalar/LoopInterchange.cpp

Lines changed: 71 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -407,6 +407,32 @@ class LoopInterchangeLegality {
407407
SmallVector<PHINode *, 8> InnerLoopInductions;
408408
};
409409

410+
/// Manages information utilized by the profitability check for cache. The main
411+
/// purpose of this class is to delay the computation of CacheCost until it is
412+
/// actually needed.
413+
class CacheCostManager {
414+
Loop *OutermostLoop;
415+
LoopStandardAnalysisResults *AR;
416+
DependenceInfo *DI;
417+
418+
/// CacheCost for \ref OutermostLoop. Once it is computed, it is cached. Note
419+
/// that the result can be nullptr.
420+
std::optional<std::unique_ptr<CacheCost>> CC;
421+
422+
/// Maps each loop to an index representing the optimal position within the
423+
/// loop-nest, as determined by the cache cost analysis.
424+
DenseMap<const Loop *, unsigned> CostMap;
425+
426+
void computeIfUnitinialized();
427+
428+
public:
429+
CacheCostManager(Loop *OutermostLoop, LoopStandardAnalysisResults *AR,
430+
DependenceInfo *DI)
431+
: OutermostLoop(OutermostLoop), AR(AR), DI(DI) {}
432+
CacheCost *getCacheCost();
433+
const DenseMap<const Loop *, unsigned> &getCostMap();
434+
};
435+
410436
/// LoopInterchangeProfitability checks if it is profitable to interchange the
411437
/// loop.
412438
class LoopInterchangeProfitability {
@@ -418,15 +444,12 @@ class LoopInterchangeProfitability {
418444
/// Check if the loop interchange is profitable.
419445
bool isProfitable(const Loop *InnerLoop, const Loop *OuterLoop,
420446
unsigned InnerLoopId, unsigned OuterLoopId,
421-
CharMatrix &DepMatrix,
422-
const DenseMap<const Loop *, unsigned> &CostMap,
423-
std::unique_ptr<CacheCost> &CC);
447+
CharMatrix &DepMatrix, CacheCostManager &CCM);
424448

425449
private:
426450
int getInstrOrderCost();
427451
std::optional<bool> isProfitablePerLoopCacheAnalysis(
428-
const DenseMap<const Loop *, unsigned> &CostMap,
429-
std::unique_ptr<CacheCost> &CC);
452+
const DenseMap<const Loop *, unsigned> &CostMap, CacheCost *CC);
430453
std::optional<bool> isProfitablePerInstrOrderCost();
431454
std::optional<bool> isProfitableForVectorization(unsigned InnerLoopId,
432455
unsigned OuterLoopId,
@@ -477,15 +500,15 @@ struct LoopInterchange {
477500
LoopInfo *LI = nullptr;
478501
DependenceInfo *DI = nullptr;
479502
DominatorTree *DT = nullptr;
480-
std::unique_ptr<CacheCost> CC = nullptr;
503+
LoopStandardAnalysisResults *AR = nullptr;
481504

482505
/// Interface to emit optimization remarks.
483506
OptimizationRemarkEmitter *ORE;
484507

485508
LoopInterchange(ScalarEvolution *SE, LoopInfo *LI, DependenceInfo *DI,
486-
DominatorTree *DT, std::unique_ptr<CacheCost> &CC,
509+
DominatorTree *DT, LoopStandardAnalysisResults *AR,
487510
OptimizationRemarkEmitter *ORE)
488-
: SE(SE), LI(LI), DI(DI), DT(DT), CC(std::move(CC)), ORE(ORE) {}
511+
: SE(SE), LI(LI), DI(DI), DT(DT), AR(AR), ORE(ORE) {}
489512

490513
bool run(Loop *L) {
491514
if (L->getParentLoop())
@@ -540,19 +563,7 @@ struct LoopInterchange {
540563
}
541564

542565
unsigned SelecLoopId = selectLoopForInterchange(LoopList);
543-
// Obtain the loop vector returned from loop cache analysis beforehand,
544-
// and put each <Loop, index> pair into a map for constant time query
545-
// later. Indices in loop vector represent the optimal order of the
546-
// corresponding loop, e.g., given a loopnest with depth N, index 0
547-
// indicates the loop should be placed as the outermost loop and index N
548-
// indicates the loop should be placed as the innermost loop.
549-
//
550-
// For the old pass manager CacheCost would be null.
551-
DenseMap<const Loop *, unsigned> CostMap;
552-
if (CC != nullptr) {
553-
for (const auto &[Idx, Cost] : enumerate(CC->getLoopCosts()))
554-
CostMap[Cost.first] = Idx;
555-
}
566+
CacheCostManager CCM(LoopList[0], AR, DI);
556567
// We try to achieve the globally optimal memory access for the loopnest,
557568
// and do interchange in a bubble-sort fashion. We start from
558569
// the innermost loop, move it outwards to the best possible position
@@ -561,7 +572,7 @@ struct LoopInterchange {
561572
bool ChangedPerIter = false;
562573
for (unsigned i = SelecLoopId; i > SelecLoopId - j; i--) {
563574
bool Interchanged =
564-
processLoop(LoopList, i, i - 1, DependencyMatrix, CostMap);
575+
processLoop(LoopList, i, i - 1, DependencyMatrix, CCM);
565576
ChangedPerIter |= Interchanged;
566577
Changed |= Interchanged;
567578
}
@@ -576,7 +587,7 @@ struct LoopInterchange {
576587
bool processLoop(SmallVectorImpl<Loop *> &LoopList, unsigned InnerLoopId,
577588
unsigned OuterLoopId,
578589
std::vector<std::vector<char>> &DependencyMatrix,
579-
const DenseMap<const Loop *, unsigned> &CostMap) {
590+
CacheCostManager &CCM) {
580591
Loop *OuterLoop = LoopList[OuterLoopId];
581592
Loop *InnerLoop = LoopList[InnerLoopId];
582593
LLVM_DEBUG(dbgs() << "Processing InnerLoopId = " << InnerLoopId
@@ -589,7 +600,7 @@ struct LoopInterchange {
589600
LLVM_DEBUG(dbgs() << "Loops are legal to interchange\n");
590601
LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
591602
if (!LIP.isProfitable(InnerLoop, OuterLoop, InnerLoopId, OuterLoopId,
592-
DependencyMatrix, CostMap, CC)) {
603+
DependencyMatrix, CCM)) {
593604
LLVM_DEBUG(dbgs() << "Interchanging loops not profitable.\n");
594605
return false;
595606
}
@@ -1122,6 +1133,35 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
11221133
return true;
11231134
}
11241135

1136+
void CacheCostManager::computeIfUnitinialized() {
1137+
if (CC.has_value())
1138+
return;
1139+
1140+
LLVM_DEBUG(dbgs() << "Compute CacheCost.\n");
1141+
CC = CacheCost::getCacheCost(*OutermostLoop, *AR, *DI);
1142+
// Obtain the loop vector returned from loop cache analysis beforehand,
1143+
// and put each <Loop, index> pair into a map for constant time query
1144+
// later. Indices in loop vector represent the optimal order of the
1145+
// corresponding loop, e.g., given a loopnest with depth N, index 0
1146+
// indicates the loop should be placed as the outermost loop and index N-1
1147+
// indicates the loop should be placed as the innermost loop.
1148+
//
1149+
// For the old pass manager CacheCost would be null.
1150+
if (*CC != nullptr)
1151+
for (const auto &[Idx, Cost] : enumerate((*CC)->getLoopCosts()))
1152+
CostMap[Cost.first] = Idx;
1153+
}
1154+
1155+
CacheCost *CacheCostManager::getCacheCost() {
1156+
computeIfUnitinialized();
1157+
return CC->get();
1158+
}
1159+
1160+
const DenseMap<const Loop *, unsigned> &CacheCostManager::getCostMap() {
1161+
computeIfUnitinialized();
1162+
return CostMap;
1163+
}
1164+
11251165
int LoopInterchangeProfitability::getInstrOrderCost() {
11261166
unsigned GoodOrder, BadOrder;
11271167
BadOrder = GoodOrder = 0;
@@ -1177,8 +1217,7 @@ int LoopInterchangeProfitability::getInstrOrderCost() {
11771217

11781218
std::optional<bool>
11791219
LoopInterchangeProfitability::isProfitablePerLoopCacheAnalysis(
1180-
const DenseMap<const Loop *, unsigned> &CostMap,
1181-
std::unique_ptr<CacheCost> &CC) {
1220+
const DenseMap<const Loop *, unsigned> &CostMap, CacheCost *CC) {
11821221
// This is the new cost model returned from loop cache analysis.
11831222
// A smaller index means the loop should be placed an outer loop, and vice
11841223
// versa.
@@ -1246,9 +1285,7 @@ std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization(
12461285

12471286
bool LoopInterchangeProfitability::isProfitable(
12481287
const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId,
1249-
unsigned OuterLoopId, CharMatrix &DepMatrix,
1250-
const DenseMap<const Loop *, unsigned> &CostMap,
1251-
std::unique_ptr<CacheCost> &CC) {
1288+
unsigned OuterLoopId, CharMatrix &DepMatrix, CacheCostManager &CCM) {
12521289
// isProfitable() is structured to avoid endless loop interchange. If the
12531290
// highest priority rule (isProfitablePerLoopCacheAnalysis by default) could
12541291
// decide the profitability then, profitability check will stop and return the
@@ -1261,9 +1298,12 @@ bool LoopInterchangeProfitability::isProfitable(
12611298
std::optional<bool> shouldInterchange;
12621299
for (RuleTy RT : Profitabilities) {
12631300
switch (RT) {
1264-
case RuleTy::PerLoopCacheAnalysis:
1301+
case RuleTy::PerLoopCacheAnalysis: {
1302+
CacheCost *CC = CCM.getCacheCost();
1303+
const DenseMap<const Loop *, unsigned> &CostMap = CCM.getCostMap();
12651304
shouldInterchange = isProfitablePerLoopCacheAnalysis(CostMap, CC);
12661305
break;
1306+
}
12671307
case RuleTy::PerInstrOrderCost:
12681308
shouldInterchange = isProfitablePerInstrOrderCost();
12691309
break;
@@ -1841,10 +1881,7 @@ PreservedAnalyses LoopInterchangePass::run(LoopNest &LN,
18411881
});
18421882

18431883
DependenceInfo DI(&F, &AR.AA, &AR.SE, &AR.LI);
1844-
std::unique_ptr<CacheCost> CC =
1845-
CacheCost::getCacheCost(LN.getOutermostLoop(), AR, DI);
1846-
1847-
if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, CC, &ORE).run(LN))
1884+
if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, &AR, &ORE).run(LN))
18481885
return PreservedAnalyses::all();
18491886
U.markLoopNestChanged(true);
18501887
return getLoopPassPreservedAnalyses();
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
; REQUIRES: asserts
2+
3+
; RUN: opt -passes=loop-interchange -debug -disable-output %s 2>&1 | FileCheck %s
4+
5+
@A = global [16 x [16 x i32]] zeroinitializer
6+
7+
; Check that the CacheCost is calculated only when required. In this case, it
8+
; is computed after passing the legality check.
9+
;
10+
; for (i = 0; i < 16; i++)
11+
; for (j = 0; j < 16; j++)
12+
; A[j][i] += 1;
13+
14+
; CHECK: Loops are legal to interchange
15+
; CHECK: Compute CacheCost
16+
define void @legal_to_interchange() {
17+
entry:
18+
br label %for.i.header
19+
20+
for.i.header:
21+
%i = phi i32 [ 0, %entry ], [ %i.next, %for.i.latch ]
22+
br label %for.j
23+
24+
for.j:
25+
%j = phi i32 [ 0, %for.i.header ], [ %j.next, %for.j ]
26+
%idx = getelementptr inbounds [16 x [16 x i32]], ptr @A, i32 0, i32 %j, i32 %i
27+
%val = load i32, ptr %idx
28+
%inc = add i32 %val, 1
29+
store i32 %inc, ptr %idx
30+
%j.next = add i32 %j, 1
31+
%j.exit = icmp eq i32 %j.next, 16
32+
br i1 %j.exit, label %for.i.latch, label %for.j
33+
34+
for.i.latch:
35+
%i.next = add i32 %i, 1
36+
%i.exit = icmp eq i32 %i.next, 16
37+
br i1 %i.exit, label %exit, label %for.i.header
38+
39+
exit:
40+
ret void
41+
}
42+
43+
; Check that the CacheCost is not calculated when not required. In this case,
44+
; the legality check always fails so that we do not need to compute the
45+
; CacheCost.
46+
;
47+
; for (i = 0; i < 16; i++)
48+
; for (j = 0; j < 16; j++)
49+
; A[j][i] = A[i][j];
50+
51+
; CHECK-NOT: Compute CacheCost
52+
define void @illegal_to_interchange() {
53+
entry:
54+
br label %for.i.header
55+
56+
for.i.header:
57+
%i = phi i32 [ 0, %entry ], [ %i.next, %for.i.latch ]
58+
br label %for.j
59+
60+
for.j:
61+
%j = phi i32 [ 0, %for.i.header ], [ %j.next, %for.j ]
62+
%idx.load = getelementptr inbounds [16 x [16 x i32]], ptr @A, i32 0, i32 %i, i32 %j
63+
%idx.store = getelementptr inbounds [16 x [16 x i32]], ptr @A, i32 0, i32 %j, i32 %i
64+
%val = load i32, ptr %idx.load
65+
store i32 %val, ptr %idx.store
66+
%j.next = add i32 %j, 1
67+
%j.exit = icmp eq i32 %j.next, 16
68+
br i1 %j.exit, label %for.i.latch, label %for.j
69+
70+
for.i.latch:
71+
%i.next = add i32 %i, 1
72+
%i.exit = icmp eq i32 %i.next, 16
73+
br i1 %i.exit, label %exit, label %for.i.header
74+
75+
exit:
76+
ret void
77+
}

0 commit comments

Comments
 (0)