Skip to content

Commit c588b5e

Browse files
spupyrev authored and memfrob committed
speeding up caches for hfsort+
Summary: When running hfsort+, we invalidate too many cache entries, which leads to inefficiencies. We only need to invalidate cache entries for the pairs of clusters (Into, X) and (X, Into) when modifying cluster Into (for all clusters X). With this modification, we no longer need ShortCallCache, since each short-call value is computed only once per pair of clusters. (cherry picked from FBD6341039)
1 parent 30cc9b6 commit c588b5e

File tree

3 files changed

+22
-94
lines changed

3 files changed

+22
-94
lines changed

bolt/Passes/HFSort.h

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -103,9 +103,7 @@ std::vector<Cluster> clusterize(const CallGraph &Cg);
103103
/*
104104
* Optimize function placement for iTLB cache and i-cache.
105105
*/
106-
std::vector<Cluster> hfsortPlus(CallGraph &Cg,
107-
bool UseGainCache = true,
108-
bool UseShortCallCache = true);
106+
std::vector<Cluster> hfsortPlus(CallGraph &Cg, bool UseGainCache = true);
109107

110108
/*
111109
* Pettis-Hansen code layout algorithm

bolt/Passes/HFSortPlus.cpp

Lines changed: 20 additions & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ using namespace llvm;
4646
using namespace bolt;
4747

4848
namespace opts {
49-
extern cl::OptionCategory BoltCategory;
49+
5050
extern cl::OptionCategory BoltOptCategory;
5151
extern cl::opt<bool> Verbosity;
5252

@@ -92,17 +92,6 @@ int32_t ITLBPageSize;
9292
// while smaller values result in better i-cache performance
9393
int32_t ITLBEntries;
9494

95-
const char* cacheKindString(bool UseGainCache, bool UseShortCallCache) {
96-
if (UseGainCache && UseShortCallCache)
97-
return "gain + short call cache";
98-
else if (UseGainCache)
99-
return "gain cache";
100-
else if (UseShortCallCache)
101-
return "short call cache";
102-
else
103-
return "no cache";
104-
}
105-
10695
// This class maintains adjacency information for all Clusters being
10796
// processed. It is used to invalidate cache entries when merging
10897
// Clusters and for visiting all neighbors of any given Cluster.
@@ -215,17 +204,16 @@ class PrecomputedResults {
215204
Valid[Index] = true;
216205
}
217206

218-
void invalidate(const AdjacencyMatrix &Adjacent, const Cluster *C) {
219-
invalidate(C);
220-
Adjacent.forallAdjacent(C, [&](const Cluster *A) { invalidate(A); });
221-
}
222-
private:
223207
void invalidate(const Cluster *C) {
224208
Valid.reset(C->id() * Size, (C->id() + 1) * Size);
209+
for (size_t Id = 0; Id < Size; Id++) {
210+
Valid.reset(Id * Size + C->id());
211+
}
225212
}
226213

214+
private:
227215
size_t index(const Cluster *First, const Cluster *Second) const {
228-
return (First->id() * Size) + Second->id();
216+
return First->id() * Size + Second->id();
229217
}
230218

231219
size_t Size;
@@ -347,12 +335,6 @@ class HFSortPlus {
347335
* the same cache page
348336
*/
349337
double shortCalls(const Cluster *Cluster) const {
350-
if (UseShortCallCache) {
351-
auto Itr = ShortCallCache.find(Cluster);
352-
if (Itr != ShortCallCache.end())
353-
return Itr->second;
354-
}
355-
356338
double Calls = 0;
357339
for (auto TargetId : Cluster->targets()) {
358340
for (auto Succ : Cg.successors(TargetId)) {
@@ -367,10 +349,6 @@ class HFSortPlus {
367349
}
368350
}
369351

370-
if (UseShortCallCache) {
371-
ShortCallCache[Cluster] = Calls;
372-
}
373-
374352
return Calls;
375353
}
376354

@@ -380,11 +358,6 @@ class HFSortPlus {
380358
*/
381359
double shortCalls(const Cluster *ClusterPred,
382360
const Cluster *ClusterSucc) const {
383-
if (UseShortCallCache &&
384-
ShortCallPairCache.contains(ClusterPred, ClusterSucc)) {
385-
return ShortCallPairCache.get(ClusterPred, ClusterSucc);
386-
}
387-
388361
double Calls = 0;
389362
for (auto TargetId : ClusterPred->targets()) {
390363
for (auto Succ : Cg.successors(TargetId)) {
@@ -413,10 +386,6 @@ class HFSortPlus {
413386
}
414387
}
415388

416-
if (UseShortCallCache) {
417-
ShortCallPairCache.set(ClusterPred, ClusterSucc, Calls);
418-
}
419-
420389
return Calls;
421390
}
422391

@@ -434,8 +403,8 @@ class HFSortPlus {
434403
*/
435404
double mergeGain(const Cluster *ClusterPred,
436405
const Cluster *ClusterSucc) const {
437-
if (UseGainCache && Cache.contains(ClusterPred, ClusterSucc)) {
438-
return Cache.get(ClusterPred, ClusterSucc);
406+
if (UseGainCache && GainCache.contains(ClusterPred, ClusterSucc)) {
407+
return GainCache.get(ClusterPred, ClusterSucc);
439408
}
440409

441410
// cache misses on the first cluster
@@ -460,7 +429,7 @@ class HFSortPlus {
460429
Gain /= std::min(ClusterPred->size(), ClusterSucc->size());
461430

462431
if (UseGainCache) {
463-
Cache.set(ClusterPred, ClusterSucc, Gain);
432+
GainCache.set(ClusterPred, ClusterSucc, Gain);
464433
}
465434

466435
return Gain;
@@ -513,7 +482,7 @@ class HFSortPlus {
513482
const double ProbOut =
514483
CallsFromPred > 0 ? CallsPredSucc / CallsFromPred : 0;
515484
assert(0.0 <= ProbOut && ProbOut <= 1.0 && "incorrect probability");
516-
485+
517486
// probability that the second cluster is called from the first one
518487
const double ProbIn =
519488
CallsToSucc > 0 ? CallsPredSucc / CallsToSucc : 0;
@@ -601,13 +570,12 @@ class HFSortPlus {
601570
*/
602571
std::vector<Cluster> run() {
603572
DEBUG(dbgs() << "Starting hfsort+ w/"
604-
<< cacheKindString(UseGainCache, UseShortCallCache)
573+
<< (UseGainCache ? "gain cache" : "no cache")
605574
<< " for " << Clusters.size() << " clusters "
606575
<< "with ITLBPageSize = " << ITLBPageSize << ", "
607576
<< "ITLBEntries = " << ITLBEntries << ", "
608577
<< "and MergeProbability = " << opts::MergeProbability << "\n");
609578

610-
611579
// Pass 1
612580
runPassOne();
613581

@@ -628,19 +596,15 @@ class HFSortPlus {
628596
return Result;
629597
}
630598

631-
HFSortPlus(const CallGraph &Cg,
632-
bool UseGainCache,
633-
bool UseShortCallCache)
599+
HFSortPlus(const CallGraph &Cg, bool UseGainCache)
634600
: Cg(Cg),
635601
FuncCluster(Cg.numNodes(), nullptr),
636602
Addr(Cg.numNodes(), InvalidAddr),
637603
TotalSamples(0.0),
638604
Clusters(initializeClusters()),
639605
Adjacent(Cg, Clusters, FuncCluster),
640606
UseGainCache(UseGainCache),
641-
UseShortCallCache(UseShortCallCache),
642-
Cache(Clusters.size()),
643-
ShortCallPairCache(Clusters.size()) {
607+
GainCache(Clusters.size()) {
644608
}
645609
private:
646610

@@ -696,31 +660,16 @@ class HFSortPlus {
696660
CurAddr = ((CurAddr + Align - 1) / Align) * Align;
697661
}
698662

699-
// Update caches
700-
invalidateCaches(Into);
663+
// Invalidate all cache entries associated with cluster Into
664+
if (UseGainCache) {
665+
GainCache.invalidate(Into);
666+
}
701667

702668
// Remove cluster From from the list of active clusters
703669
auto Iter = std::remove(Clusters.begin(), Clusters.end(), From);
704670
Clusters.erase(Iter, Clusters.end());
705671
}
706672

707-
/*
708-
* Invalidate all cache entries associated with cluster C and its neighbors.
709-
*/
710-
void invalidateCaches(const Cluster *C) {
711-
if (UseShortCallCache) {
712-
maybeErase(ShortCallCache, C);
713-
Adjacent.forallAdjacent(C,
714-
[this](const Cluster *A) {
715-
maybeErase(ShortCallCache, A);
716-
});
717-
ShortCallPairCache.invalidate(Adjacent, C);
718-
}
719-
if (UseGainCache) {
720-
Cache.invalidate(Adjacent, C);
721-
}
722-
}
723-
724673
// The call graph
725674
const CallGraph &Cg;
726675

@@ -746,32 +695,21 @@ class HFSortPlus {
746695
// Use cache for mergeGain results
747696
bool UseGainCache;
748697

749-
// Use caches for shortCalls results
750-
bool UseShortCallCache;
751-
752698
// A cache that keeps precomputed values of mergeGain for pairs of clusters;
753699
// when a pair of clusters (x,y) gets merged, we need to invalidate the pairs
754700
// containing both x and y and all clusters adjacent to x and y (and recompute
755701
// them on the next iteration).
756-
mutable PrecomputedResults Cache;
757-
758-
// Cache for shortCalls for a single cluster.
759-
mutable std::unordered_map<const Cluster *, double> ShortCallCache;
760-
761-
// Cache for shortCalls for a pair of Clusters
762-
mutable PrecomputedResults ShortCallPairCache;
702+
mutable PrecomputedResults GainCache;
763703
};
764704

765705
}
766706

767-
std::vector<Cluster> hfsortPlus(CallGraph &Cg,
768-
bool UseGainCache,
769-
bool UseShortCallCache) {
707+
std::vector<Cluster> hfsortPlus(CallGraph &Cg, bool UseGainCache) {
770708
// It is required that the sum of incoming arc weights is not greater
771709
// than the number of samples for every function.
772710
// Ensuring the call graph obeys the property before running the algorithm.
773711
Cg.adjustArcWeights();
774-
return HFSortPlus(Cg, UseGainCache, UseShortCallCache).run();
712+
return HFSortPlus(Cg, UseGainCache).run();
775713
}
776714

777715
}}

bolt/Passes/ReorderFunctions.cpp

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -119,14 +119,6 @@ UseGainCache("hfsort+-use-cache",
119119
llvm::cl::Hidden,
120120
llvm::cl::cat(BoltOptCategory));
121121

122-
static llvm::cl::opt<bool>
123-
UseShortCallCache("hfsort+-use-short-call-cache",
124-
llvm::cl::desc("Use a cache for shortCall results when computing hfsort+."),
125-
llvm::cl::ZeroOrMore,
126-
llvm::cl::init(true),
127-
llvm::cl::Hidden,
128-
llvm::cl::cat(BoltOptCategory));
129-
130122
} // namespace opts
131123

132124
namespace llvm {
@@ -353,7 +345,7 @@ void ReorderFunctions::runOnFunctions(BinaryContext &BC,
353345
Clusters = clusterize(Cg);
354346
break;
355347
case BinaryFunction::RT_HFSORT_PLUS:
356-
Clusters = hfsortPlus(Cg, opts::UseGainCache, opts::UseShortCallCache);
348+
Clusters = hfsortPlus(Cg, opts::UseGainCache);
357349
break;
358350
case BinaryFunction::RT_PETTIS_HANSEN:
359351
Clusters = pettisAndHansen(Cg);

0 commit comments

Comments
 (0)