Skip to content

Commit 503deec

Browse files
committed
Temporairly revert "[SimplifyCFG][LoopRotate] SimplifyCFG: disable common instruction hoisting by default, enable late in pipeline"
As disscussed in post-commit review starting with https://reviews.llvm.org/D84108#2227365 while this appears to be mostly a win overall, especially code-size-wise, this appears to shake //certain// code pattens in a way that is extremely unfavorable for performance (+30% runtime regression) on certain CPU's (i personally can't reproduce). So until the behaviour is better understood, and a path forward is mapped, let's back this out for now. This reverts commit 1d51dc3.
1 parent 5eff21c commit 503deec

File tree

10 files changed

+26
-39
lines changed

10 files changed

+26
-39
lines changed

llvm/include/llvm/Transforms/Utils/SimplifyCFGOptions.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ struct SimplifyCFGOptions {
2525
bool ForwardSwitchCondToPhi = false;
2626
bool ConvertSwitchToLookupTable = false;
2727
bool NeedCanonicalLoop = true;
28-
bool HoistCommonInsts = false;
28+
bool HoistCommonInsts = true;
2929
bool SinkCommonInsts = false;
3030
bool SimplifyCondBranch = true;
3131
bool FoldTwoEntryPHINode = true;

llvm/lib/Passes/PassBuilder.cpp

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1147,14 +1147,11 @@ ModulePassManager PassBuilder::buildModuleOptimizationPipeline(
11471147
// convert to more optimized IR using more aggressive simplify CFG options.
11481148
// The extra sinking transform can create larger basic blocks, so do this
11491149
// before SLP vectorization.
1150-
// FIXME: study whether hoisting and/or sinking of common instructions should
1151-
// be delayed until after SLP vectorizer.
1152-
OptimizePM.addPass(SimplifyCFGPass(SimplifyCFGOptions()
1153-
.forwardSwitchCondToPhi(true)
1154-
.convertSwitchToLookupTable(true)
1155-
.needCanonicalLoops(false)
1156-
.hoistCommonInsts(true)
1157-
.sinkCommonInsts(true)));
1150+
OptimizePM.addPass(SimplifyCFGPass(SimplifyCFGOptions().
1151+
forwardSwitchCondToPhi(true).
1152+
convertSwitchToLookupTable(true).
1153+
needCanonicalLoops(false).
1154+
sinkCommonInsts(true)));
11581155

11591156
// Optimize parallel scalar instruction chains into SIMD instructions.
11601157
if (PTO.SLPVectorization)

llvm/lib/Target/AArch64/AArch64TargetMachine.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -457,7 +457,6 @@ void AArch64PassConfig::addIRPasses() {
457457
.forwardSwitchCondToPhi(true)
458458
.convertSwitchToLookupTable(true)
459459
.needCanonicalLoops(false)
460-
.hoistCommonInsts(true)
461460
.sinkCommonInsts(true)));
462461

463462
// Run LoopDataPrefetch

llvm/lib/Target/ARM/ARMTargetMachine.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -409,8 +409,7 @@ void ARMPassConfig::addIRPasses() {
409409
// ldrex/strex loops to simplify this, but it needs tidying up.
410410
if (TM->getOptLevel() != CodeGenOpt::None && EnableAtomicTidy)
411411
addPass(createCFGSimplificationPass(
412-
SimplifyCFGOptions().hoistCommonInsts(true).sinkCommonInsts(true),
413-
[this](const Function &F) {
412+
SimplifyCFGOptions().sinkCommonInsts(true), [this](const Function &F) {
414413
const auto &ST = this->TM->getSubtarget<ARMSubtarget>(F);
415414
return ST.hasAnyDataBarrier() && !ST.isThumb1Only();
416415
}));

llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -329,7 +329,6 @@ void HexagonPassConfig::addIRPasses() {
329329
.forwardSwitchCondToPhi(true)
330330
.convertSwitchToLookupTable(true)
331331
.needCanonicalLoops(false)
332-
.hoistCommonInsts(true)
333332
.sinkCommonInsts(true)));
334333
if (EnableLoopPrefetch)
335334
addPass(createLoopDataPrefetchPass());

llvm/lib/Transforms/IPO/PassManagerBuilder.cpp

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -784,13 +784,10 @@ void PassManagerBuilder::populateModulePassManager(
784784
// convert to more optimized IR using more aggressive simplify CFG options.
785785
// The extra sinking transform can create larger basic blocks, so do this
786786
// before SLP vectorization.
787-
// FIXME: study whether hoisting and/or sinking of common instructions should
788-
// be delayed until after SLP vectorizer.
789787
MPM.add(createCFGSimplificationPass(SimplifyCFGOptions()
790788
.forwardSwitchCondToPhi(true)
791789
.convertSwitchToLookupTable(true)
792790
.needCanonicalLoops(false)
793-
.hoistCommonInsts(true)
794791
.sinkCommonInsts(true)));
795792

796793
if (SLPVectorize) {

llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -63,8 +63,8 @@ static cl::opt<bool> UserForwardSwitchCond(
6363
cl::desc("Forward switch condition to phi ops (default = false)"));
6464

6565
static cl::opt<bool> UserHoistCommonInsts(
66-
"hoist-common-insts", cl::Hidden, cl::init(false),
67-
cl::desc("hoist common instructions (default = false)"));
66+
"hoist-common-insts", cl::Hidden, cl::init(true),
67+
cl::desc("hoist common instructions (default = true)"));
6868

6969
static cl::opt<bool> UserSinkCommonInsts(
7070
"sink-common-insts", cl::Hidden, cl::init(false),

llvm/test/Transforms/PGOProfile/chr.ll

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2008,16 +2008,9 @@ define i64 @test_chr_22(i1 %i, i64* %j, i64 %v0) !prof !14 {
20082008
; CHECK-NEXT: bb0:
20092009
; CHECK-NEXT: [[REASS_ADD:%.*]] = shl i64 [[V0:%.*]], 1
20102010
; CHECK-NEXT: [[V2:%.*]] = add i64 [[REASS_ADD]], 3
2011-
; CHECK-NEXT: [[C1:%.*]] = icmp slt i64 [[V2]], 100
2012-
; CHECK-NEXT: br i1 [[C1]], label [[BB0_SPLIT:%.*]], label [[BB0_SPLIT_NONCHR:%.*]], !prof !15
2013-
; CHECK: bb0.split:
20142011
; CHECK-NEXT: [[V299:%.*]] = mul i64 [[V2]], 7860086430977039991
20152012
; CHECK-NEXT: store i64 [[V299]], i64* [[J:%.*]], align 4
20162013
; CHECK-NEXT: ret i64 99
2017-
; CHECK: bb0.split.nonchr:
2018-
; CHECK-NEXT: [[V299_NONCHR:%.*]] = mul i64 [[V2]], 7860086430977039991
2019-
; CHECK-NEXT: store i64 [[V299_NONCHR]], i64* [[J]], align 4
2020-
; CHECK-NEXT: ret i64 99
20212014
;
20222015
bb0:
20232016
%v1 = add i64 %v0, 3

llvm/test/Transforms/PhaseOrdering/loop-rotation-vs-common-code-hoisting.ll

Lines changed: 16 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,14 @@
55
; RUN: opt -O3 -rotation-max-header-size=1 -S < %s | FileCheck %s --check-prefixes=HOIST,THR1,FALLBACK2
66
; RUN: opt -passes='default<O3>' -rotation-max-header-size=1 -S < %s | FileCheck %s --check-prefixes=HOIST,THR1,FALLBACK3
77

8-
; RUN: opt -O3 -rotation-max-header-size=2 -S < %s | FileCheck %s --check-prefixes=ROTATED_LATER,ROTATED_LATER_OLDPM,FALLBACK4
9-
; RUN: opt -passes='default<O3>' -rotation-max-header-size=2 -S < %s | FileCheck %s --check-prefixes=ROTATED_LATER,ROTATED_LATER_NEWPM,FALLBACK5
8+
; RUN: opt -O3 -rotation-max-header-size=2 -S < %s | FileCheck %s --check-prefixes=HOIST,THR2,FALLBACK4
9+
; RUN: opt -passes='default<O3>' -rotation-max-header-size=2 -S < %s | FileCheck %s --check-prefixes=HOIST,THR2,FALLBACK5
1010

11-
; RUN: opt -O3 -rotation-max-header-size=3 -S < %s | FileCheck %s --check-prefixes=ROTATE,ROTATE_OLDPM,FALLBACK6
12-
; RUN: opt -passes='default<O3>' -rotation-max-header-size=3 -S < %s | FileCheck %s --check-prefixes=ROTATE,ROTATE_NEWPM,FALLBACK7
11+
; RUN: opt -O3 -rotation-max-header-size=3 -S < %s | FileCheck %s --check-prefixes=ROTATED_LATER,ROTATED_LATER_OLDPM,FALLBACK6
12+
; RUN: opt -passes='default<O3>' -rotation-max-header-size=3 -S < %s | FileCheck %s --check-prefixes=ROTATED_LATER,ROTATED_LATER_NEWPM,FALLBACK7
13+
14+
; RUN: opt -O3 -rotation-max-header-size=4 -S < %s | FileCheck %s --check-prefixes=ROTATE,ROTATE_OLDPM,FALLBACK8
15+
; RUN: opt -passes='default<O3>' -rotation-max-header-size=4 -S < %s | FileCheck %s --check-prefixes=ROTATE,ROTATE_NEWPM,FALLBACK9
1316

1417
; This example is produced from a very basic C code:
1518
;
@@ -58,8 +61,8 @@ define void @_Z4loopi(i32 %width) {
5861
; HOIST-NEXT: br label [[FOR_COND:%.*]]
5962
; HOIST: for.cond:
6063
; HOIST-NEXT: [[I_0:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY:%.*]] ], [ 0, [[FOR_COND_PREHEADER]] ]
61-
; HOIST-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[I_0]], [[TMP0]]
6264
; HOIST-NEXT: tail call void @f0()
65+
; HOIST-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[I_0]], [[TMP0]]
6366
; HOIST-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
6467
; HOIST: for.cond.cleanup:
6568
; HOIST-NEXT: tail call void @f2()
@@ -77,17 +80,17 @@ define void @_Z4loopi(i32 %width) {
7780
; ROTATED_LATER_OLDPM-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[FOR_COND_PREHEADER:%.*]]
7881
; ROTATED_LATER_OLDPM: for.cond.preheader:
7982
; ROTATED_LATER_OLDPM-NEXT: [[TMP0:%.*]] = add nsw i32 [[WIDTH]], -1
83+
; ROTATED_LATER_OLDPM-NEXT: tail call void @f0()
8084
; ROTATED_LATER_OLDPM-NEXT: [[EXITCOND_NOT3:%.*]] = icmp eq i32 [[TMP0]], 0
8185
; ROTATED_LATER_OLDPM-NEXT: br i1 [[EXITCOND_NOT3]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY:%.*]]
8286
; ROTATED_LATER_OLDPM: for.cond.cleanup:
83-
; ROTATED_LATER_OLDPM-NEXT: tail call void @f0()
8487
; ROTATED_LATER_OLDPM-NEXT: tail call void @f2()
8588
; ROTATED_LATER_OLDPM-NEXT: br label [[RETURN]]
8689
; ROTATED_LATER_OLDPM: for.body:
8790
; ROTATED_LATER_OLDPM-NEXT: [[I_04:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_COND_PREHEADER]] ]
88-
; ROTATED_LATER_OLDPM-NEXT: tail call void @f0()
8991
; ROTATED_LATER_OLDPM-NEXT: tail call void @f1()
9092
; ROTATED_LATER_OLDPM-NEXT: [[INC]] = add nuw i32 [[I_04]], 1
93+
; ROTATED_LATER_OLDPM-NEXT: tail call void @f0()
9194
; ROTATED_LATER_OLDPM-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[TMP0]]
9295
; ROTATED_LATER_OLDPM-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]]
9396
; ROTATED_LATER_OLDPM: return:
@@ -99,19 +102,19 @@ define void @_Z4loopi(i32 %width) {
99102
; ROTATED_LATER_NEWPM-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[FOR_COND_PREHEADER:%.*]]
100103
; ROTATED_LATER_NEWPM: for.cond.preheader:
101104
; ROTATED_LATER_NEWPM-NEXT: [[TMP0:%.*]] = add nsw i32 [[WIDTH]], -1
105+
; ROTATED_LATER_NEWPM-NEXT: tail call void @f0()
102106
; ROTATED_LATER_NEWPM-NEXT: [[EXITCOND_NOT3:%.*]] = icmp eq i32 [[TMP0]], 0
103107
; ROTATED_LATER_NEWPM-NEXT: br i1 [[EXITCOND_NOT3]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND_PREHEADER_FOR_BODY_CRIT_EDGE:%.*]]
104108
; ROTATED_LATER_NEWPM: for.cond.preheader.for.body_crit_edge:
105109
; ROTATED_LATER_NEWPM-NEXT: [[INC_1:%.*]] = add nuw i32 0, 1
106110
; ROTATED_LATER_NEWPM-NEXT: br label [[FOR_BODY:%.*]]
107111
; ROTATED_LATER_NEWPM: for.cond.cleanup:
108-
; ROTATED_LATER_NEWPM-NEXT: tail call void @f0()
109112
; ROTATED_LATER_NEWPM-NEXT: tail call void @f2()
110113
; ROTATED_LATER_NEWPM-NEXT: br label [[RETURN]]
111114
; ROTATED_LATER_NEWPM: for.body:
112115
; ROTATED_LATER_NEWPM-NEXT: [[INC_PHI:%.*]] = phi i32 [ [[INC_0:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE:%.*]] ], [ [[INC_1]], [[FOR_COND_PREHEADER_FOR_BODY_CRIT_EDGE]] ]
113-
; ROTATED_LATER_NEWPM-NEXT: tail call void @f0()
114116
; ROTATED_LATER_NEWPM-NEXT: tail call void @f1()
117+
; ROTATED_LATER_NEWPM-NEXT: tail call void @f0()
115118
; ROTATED_LATER_NEWPM-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC_PHI]], [[TMP0]]
116119
; ROTATED_LATER_NEWPM-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY_FOR_BODY_CRIT_EDGE]]
117120
; ROTATED_LATER_NEWPM: for.body.for.body_crit_edge:
@@ -126,19 +129,19 @@ define void @_Z4loopi(i32 %width) {
126129
; ROTATE_OLDPM-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[FOR_COND_PREHEADER:%.*]]
127130
; ROTATE_OLDPM: for.cond.preheader:
128131
; ROTATE_OLDPM-NEXT: [[CMP13_NOT:%.*]] = icmp eq i32 [[WIDTH]], 1
132+
; ROTATE_OLDPM-NEXT: tail call void @f0()
129133
; ROTATE_OLDPM-NEXT: br i1 [[CMP13_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
130134
; ROTATE_OLDPM: for.body.preheader:
131135
; ROTATE_OLDPM-NEXT: [[TMP0:%.*]] = add nsw i32 [[WIDTH]], -1
132136
; ROTATE_OLDPM-NEXT: br label [[FOR_BODY:%.*]]
133137
; ROTATE_OLDPM: for.cond.cleanup:
134-
; ROTATE_OLDPM-NEXT: tail call void @f0()
135138
; ROTATE_OLDPM-NEXT: tail call void @f2()
136139
; ROTATE_OLDPM-NEXT: br label [[RETURN]]
137140
; ROTATE_OLDPM: for.body:
138141
; ROTATE_OLDPM-NEXT: [[I_04:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
139-
; ROTATE_OLDPM-NEXT: tail call void @f0()
140142
; ROTATE_OLDPM-NEXT: tail call void @f1()
141143
; ROTATE_OLDPM-NEXT: [[INC]] = add nuw nsw i32 [[I_04]], 1
144+
; ROTATE_OLDPM-NEXT: tail call void @f0()
142145
; ROTATE_OLDPM-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[TMP0]]
143146
; ROTATE_OLDPM-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]]
144147
; ROTATE_OLDPM: return:
@@ -150,19 +153,19 @@ define void @_Z4loopi(i32 %width) {
150153
; ROTATE_NEWPM-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[FOR_COND_PREHEADER:%.*]]
151154
; ROTATE_NEWPM: for.cond.preheader:
152155
; ROTATE_NEWPM-NEXT: [[CMP13_NOT:%.*]] = icmp eq i32 [[WIDTH]], 1
156+
; ROTATE_NEWPM-NEXT: tail call void @f0()
153157
; ROTATE_NEWPM-NEXT: br i1 [[CMP13_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
154158
; ROTATE_NEWPM: for.body.preheader:
155159
; ROTATE_NEWPM-NEXT: [[TMP0:%.*]] = add nsw i32 [[WIDTH]], -1
156160
; ROTATE_NEWPM-NEXT: [[INC_1:%.*]] = add nuw nsw i32 0, 1
157161
; ROTATE_NEWPM-NEXT: br label [[FOR_BODY:%.*]]
158162
; ROTATE_NEWPM: for.cond.cleanup:
159-
; ROTATE_NEWPM-NEXT: tail call void @f0()
160163
; ROTATE_NEWPM-NEXT: tail call void @f2()
161164
; ROTATE_NEWPM-NEXT: br label [[RETURN]]
162165
; ROTATE_NEWPM: for.body:
163166
; ROTATE_NEWPM-NEXT: [[INC_PHI:%.*]] = phi i32 [ [[INC_0:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE:%.*]] ], [ [[INC_1]], [[FOR_BODY_PREHEADER]] ]
164-
; ROTATE_NEWPM-NEXT: tail call void @f0()
165167
; ROTATE_NEWPM-NEXT: tail call void @f1()
168+
; ROTATE_NEWPM-NEXT: tail call void @f0()
166169
; ROTATE_NEWPM-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC_PHI]], [[TMP0]]
167170
; ROTATE_NEWPM-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY_FOR_BODY_CRIT_EDGE]]
168171
; ROTATE_NEWPM: for.body.for.body_crit_edge:

llvm/test/Transforms/SimplifyCFG/common-code-hoisting.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
22
; RUN: opt -simplifycfg -hoist-common-insts=1 -S < %s | FileCheck %s --check-prefixes=HOIST
33
; RUN: opt -simplifycfg -hoist-common-insts=0 -S < %s | FileCheck %s --check-prefixes=NOHOIST
4-
; RUN: opt -simplifycfg -S < %s | FileCheck %s --check-prefixes=NOHOIST,DEFAULT
4+
; RUN: opt -simplifycfg -S < %s | FileCheck %s --check-prefixes=HOIST,DEFAULT
55

66
; This example is produced from a very basic C code:
77
;

0 commit comments

Comments
 (0)