Skip to content

Commit 8d3f497

Browse files
authored
[StructurizeCFG] Hoist and simplify zero-cost incoming else phi values (#139605)
The order of if and else blocks can introduce unnecessary VGPR copies. Consider the case of an if-else block where the incoming phi from the 'Else block' only contains zero-cost instructions, and the 'Then' block modifies some value. There would be no interference when coalescing because only one value is live at any point before structurization. However, in the structurized CFG, the Then value is live at 'Else' block due to the path if→flow→else, leading to additional VGPR copies. This patch addresses the issue by: - Identifying PHI nodes with zero-cost incoming values from the Else block and hoisting those values to the nearest common dominator of the Then and Else blocks. - Updating Flow PHI nodes by replacing poison entries (on the if→flow edge) with the correct hoisted values.
1 parent eaac713 commit 8d3f497

File tree

4 files changed

+466
-11
lines changed

4 files changed

+466
-11
lines changed

llvm/lib/Transforms/Scalar/StructurizeCFG.cpp

Lines changed: 122 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include "llvm/Analysis/RegionInfo.h"
2020
#include "llvm/Analysis/RegionIterator.h"
2121
#include "llvm/Analysis/RegionPass.h"
22+
#include "llvm/Analysis/TargetTransformInfo.h"
2223
#include "llvm/Analysis/UniformityAnalysis.h"
2324
#include "llvm/IR/BasicBlock.h"
2425
#include "llvm/IR/CFG.h"
@@ -128,6 +129,7 @@ struct PredInfo {
128129
using BBPredicates = DenseMap<BasicBlock *, PredInfo>;
129130
using PredMap = DenseMap<BasicBlock *, BBPredicates>;
130131
using BB2BBMap = DenseMap<BasicBlock *, BasicBlock *>;
132+
using Val2BBMap = DenseMap<Value *, BasicBlock *>;
131133

132134
// A traits type that is intended to be used in graph algorithms. The graph
133135
// traits starts at an entry node, and traverses the RegionNodes that are in
@@ -279,7 +281,7 @@ class StructurizeCFG {
279281
ConstantInt *BoolTrue;
280282
ConstantInt *BoolFalse;
281283
Value *BoolPoison;
282-
284+
const TargetTransformInfo *TTI;
283285
Function *Func;
284286
Region *ParentRegion;
285287

@@ -301,8 +303,12 @@ class StructurizeCFG {
301303
PredMap LoopPreds;
302304
BranchVector LoopConds;
303305

306+
Val2BBMap HoistedValues;
307+
304308
RegionNode *PrevNode;
305309

310+
void hoistZeroCostElseBlockPhiValues(BasicBlock *ElseBB, BasicBlock *ThenBB);
311+
306312
void orderNodes();
307313

308314
void analyzeLoops(RegionNode *N);
@@ -332,6 +338,8 @@ class StructurizeCFG {
332338

333339
void simplifyAffectedPhis();
334340

341+
void simplifyHoistedPhis();
342+
335343
DebugLoc killTerminator(BasicBlock *BB);
336344

337345
void changeExit(RegionNode *Node, BasicBlock *NewExit,
@@ -359,7 +367,7 @@ class StructurizeCFG {
359367

360368
public:
361369
void init(Region *R);
362-
bool run(Region *R, DominatorTree *DT);
370+
bool run(Region *R, DominatorTree *DT, const TargetTransformInfo *TTI);
363371
bool makeUniformRegion(Region *R, UniformityInfo &UA);
364372
};
365373

@@ -385,16 +393,21 @@ class StructurizeCFGLegacyPass : public RegionPass {
385393
if (SCFG.makeUniformRegion(R, UA))
386394
return false;
387395
}
396+
Function *F = R->getEntry()->getParent();
397+
const TargetTransformInfo *TTI =
398+
&getAnalysis<TargetTransformInfoWrapperPass>().getTTI(*F);
388399
DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
389-
return SCFG.run(R, DT);
400+
return SCFG.run(R, DT, TTI);
390401
}
391402

392403
StringRef getPassName() const override { return "Structurize control flow"; }
393404

394405
void getAnalysisUsage(AnalysisUsage &AU) const override {
395406
if (SkipUniformRegions)
396407
AU.addRequired<UniformityInfoWrapperPass>();
408+
AU.addRequired<TargetTransformInfoWrapperPass>();
397409
AU.addRequired<DominatorTreeWrapperPass>();
410+
AU.addRequired<TargetTransformInfoWrapperPass>();
398411

399412
AU.addPreserved<DominatorTreeWrapperPass>();
400413
RegionPass::getAnalysisUsage(AU);
@@ -403,6 +416,34 @@ class StructurizeCFGLegacyPass : public RegionPass {
403416

404417
} // end anonymous namespace
405418

419+
/// Checks whether an instruction is zero cost instruction and checks if the
420+
/// operands are from different BB. If so, this instruction can be coalesced
421+
/// if its hoisted to predecessor block. So, this returns true.
422+
static bool isHoistableInstruction(Instruction *I, BasicBlock *BB,
423+
const TargetTransformInfo *TTI) {
424+
if (I->getParent() != BB)
425+
return false;
426+
427+
// If the instruction is not a zero cost instruction, return false.
428+
auto Cost = TTI->getInstructionCost(I, TargetTransformInfo::TCK_Latency);
429+
InstructionCost::CostType CostVal =
430+
Cost.isValid()
431+
? Cost.getValue()
432+
: (InstructionCost::CostType)TargetTransformInfo::TCC_Expensive;
433+
if (CostVal != 0)
434+
return false;
435+
436+
// Check if any operands are instructions defined in the same block.
437+
for (auto &Op : I->operands()) {
438+
if (auto *OpI = dyn_cast<Instruction>(Op)) {
439+
if (OpI->getParent() == BB)
440+
return false;
441+
}
442+
}
443+
444+
return true;
445+
}
446+
406447
char StructurizeCFGLegacyPass::ID = 0;
407448

408449
INITIALIZE_PASS_BEGIN(StructurizeCFGLegacyPass, "structurizecfg",
@@ -413,6 +454,39 @@ INITIALIZE_PASS_DEPENDENCY(RegionInfoPass)
413454
INITIALIZE_PASS_END(StructurizeCFGLegacyPass, "structurizecfg",
414455
"Structurize the CFG", false, false)
415456

457+
/// Structurization can introduce unnecessary VGPR copies due to register
458+
/// coalescing interference. For example, if the Else block has a zero-cost
459+
/// instruction and the Then block modifies the VGPR value, only one value is
460+
/// live at a time in merge block before structurization. After structurization,
461+
/// the coalescer may incorrectly treat the Then value as live in the Else block
462+
/// (via the path Then → Flow → Else), leading to unnecessary VGPR copies.
463+
///
464+
/// This function examines phi nodes whose incoming values are zero-cost
465+
/// instructions in the Else block. It identifies such values that can be safely
466+
/// hoisted and moves them to the nearest common dominator of Then and Else
467+
/// blocks. A follow-up function after setting PhiNodes assigns the hoisted
468+
/// value to poison phi nodes along the if→flow edge, aiding register coalescing
469+
/// and minimizing unnecessary live ranges.
470+
void StructurizeCFG::hoistZeroCostElseBlockPhiValues(BasicBlock *ElseBB,
471+
BasicBlock *ThenBB) {
472+
473+
BasicBlock *ElseSucc = ElseBB->getSingleSuccessor();
474+
BasicBlock *CommonDominator = DT->findNearestCommonDominator(ElseBB, ThenBB);
475+
476+
if (!ElseSucc || !CommonDominator)
477+
return;
478+
Instruction *Term = CommonDominator->getTerminator();
479+
for (PHINode &Phi : ElseSucc->phis()) {
480+
Value *ElseVal = Phi.getIncomingValueForBlock(ElseBB);
481+
auto *Inst = dyn_cast<Instruction>(ElseVal);
482+
if (!Inst || !isHoistableInstruction(Inst, ElseBB, TTI))
483+
continue;
484+
Inst->removeFromParent();
485+
Inst->insertInto(CommonDominator, Term->getIterator());
486+
HoistedValues[Inst] = CommonDominator;
487+
}
488+
}
489+
416490
/// Build up the general order of nodes, by performing a topological sort of the
417491
/// parent region's nodes, while ensuring that there is no outer cycle node
418492
/// between any two inner cycle nodes.
@@ -535,7 +609,7 @@ void StructurizeCFG::gatherPredicates(RegionNode *N) {
535609
BasicBlock *Other = Term->getSuccessor(!i);
536610
if (Visited.count(Other) && !Loops.count(Other) &&
537611
!Pred.count(Other) && !Pred.count(P)) {
538-
612+
hoistZeroCostElseBlockPhiValues(Succ, Other);
539613
Pred[Other] = {BoolFalse, std::nullopt};
540614
Pred[P] = {BoolTrue, std::nullopt};
541615
continue;
@@ -891,6 +965,44 @@ void StructurizeCFG::setPhiValues() {
891965
AffectedPhis.append(InsertedPhis.begin(), InsertedPhis.end());
892966
}
893967

968+
/// Updates PHI nodes after hoisted zero cost instructions by replacing poison
969+
/// entries on Flow nodes with the appropriate hoisted values
970+
void StructurizeCFG::simplifyHoistedPhis() {
971+
for (WeakVH VH : AffectedPhis) {
972+
PHINode *Phi = dyn_cast_or_null<PHINode>(VH);
973+
if (!Phi || Phi->getNumIncomingValues() != 2)
974+
continue;
975+
976+
for (int i = 0; i < 2; i++) {
977+
Value *V = Phi->getIncomingValue(i);
978+
auto BBIt = HoistedValues.find(V);
979+
980+
if (BBIt == HoistedValues.end())
981+
continue;
982+
983+
Value *OtherV = Phi->getIncomingValue(!i);
984+
PHINode *OtherPhi = dyn_cast<PHINode>(OtherV);
985+
if (!OtherPhi)
986+
continue;
987+
988+
int PoisonValBBIdx = -1;
989+
for (size_t i = 0; i < OtherPhi->getNumIncomingValues(); i++) {
990+
if (!isa<PoisonValue>(OtherPhi->getIncomingValue(i)))
991+
continue;
992+
PoisonValBBIdx = i;
993+
break;
994+
}
995+
if (PoisonValBBIdx == -1 ||
996+
!DT->dominates(BBIt->second,
997+
OtherPhi->getIncomingBlock(PoisonValBBIdx)))
998+
continue;
999+
1000+
OtherPhi->setIncomingValue(PoisonValBBIdx, V);
1001+
Phi->setIncomingValue(i, OtherV);
1002+
}
1003+
}
1004+
}
1005+
8941006
void StructurizeCFG::simplifyAffectedPhis() {
8951007
bool Changed;
8961008
do {
@@ -1283,12 +1395,13 @@ bool StructurizeCFG::makeUniformRegion(Region *R, UniformityInfo &UA) {
12831395
}
12841396

12851397
/// Run the transformation for each region found
1286-
bool StructurizeCFG::run(Region *R, DominatorTree *DT) {
1398+
bool StructurizeCFG::run(Region *R, DominatorTree *DT,
1399+
const TargetTransformInfo *TTI) {
12871400
if (R->isTopLevelRegion())
12881401
return false;
12891402

12901403
this->DT = DT;
1291-
1404+
this->TTI = TTI;
12921405
Func = R->getEntry()->getParent();
12931406
assert(hasOnlySimpleTerminator(*Func) && "Unsupported block terminator.");
12941407

@@ -1300,6 +1413,7 @@ bool StructurizeCFG::run(Region *R, DominatorTree *DT) {
13001413
insertConditions(false);
13011414
insertConditions(true);
13021415
setPhiValues();
1416+
simplifyHoistedPhis();
13031417
simplifyConditions();
13041418
simplifyAffectedPhis();
13051419
rebuildSSA();
@@ -1349,7 +1463,7 @@ PreservedAnalyses StructurizeCFGPass::run(Function &F,
13491463
bool Changed = false;
13501464
DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F);
13511465
auto &RI = AM.getResult<RegionInfoAnalysis>(F);
1352-
1466+
TargetTransformInfo *TTI = &AM.getResult<TargetIRAnalysis>(F);
13531467
UniformityInfo *UI = nullptr;
13541468
if (SkipUniformRegions)
13551469
UI = &AM.getResult<UniformityInfoAnalysis>(F);
@@ -1368,7 +1482,7 @@ PreservedAnalyses StructurizeCFGPass::run(Function &F,
13681482
continue;
13691483
}
13701484

1371-
Changed |= SCFG.run(R, DT);
1485+
Changed |= SCFG.run(R, DT, TTI);
13721486
}
13731487
if (!Changed)
13741488
return PreservedAnalyses::all();

llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9851,8 +9851,8 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5
98519851
; CHECK-NEXT: s_andn2_saveexec_b32 s6, s6
98529852
; CHECK-NEXT: s_cbranch_execz .LBB8_6
98539853
; CHECK-NEXT: ; %bb.4: ; %memmove_bwd_loop.preheader
9854-
; CHECK-NEXT: v_add_nc_u32_e32 v0, 0x700, v0
98559854
; CHECK-NEXT: v_add_nc_u32_e32 v1, 0x700, v1
9855+
; CHECK-NEXT: v_add_nc_u32_e32 v0, 0x700, v0
98569856
; CHECK-NEXT: s_movk_i32 s4, 0xf800
98579857
; CHECK-NEXT: s_mov_b32 s5, -1
98589858
; CHECK-NEXT: .LBB8_5: ; %memmove_bwd_loop
@@ -11167,8 +11167,8 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5
1116711167
; ALIGNED-NEXT: s_andn2_saveexec_b32 s6, s6
1116811168
; ALIGNED-NEXT: s_cbranch_execz .LBB8_6
1116911169
; ALIGNED-NEXT: ; %bb.4: ; %memmove_bwd_loop.preheader
11170-
; ALIGNED-NEXT: v_add_nc_u32_e32 v0, 0x700, v0
1117111170
; ALIGNED-NEXT: v_add_nc_u32_e32 v1, 0x700, v1
11171+
; ALIGNED-NEXT: v_add_nc_u32_e32 v0, 0x700, v0
1117211172
; ALIGNED-NEXT: s_movk_i32 s4, 0xf800
1117311173
; ALIGNED-NEXT: s_mov_b32 s5, -1
1117411174
; ALIGNED-NEXT: .LBB8_5: ; %memmove_bwd_loop
@@ -12381,8 +12381,8 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5
1238112381
; UNROLL3-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:2024
1238212382
; UNROLL3-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:2020
1238312383
; UNROLL3-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:2016
12384-
; UNROLL3-NEXT: v_add_nc_u32_e32 v2, 0x7b0, v0
1238512384
; UNROLL3-NEXT: v_add_nc_u32_e32 v1, 0x7b0, v1
12385+
; UNROLL3-NEXT: v_add_nc_u32_e32 v2, 0x7b0, v0
1238612386
; UNROLL3-NEXT: s_waitcnt vmcnt(3)
1238712387
; UNROLL3-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:2028
1238812388
; UNROLL3-NEXT: s_waitcnt vmcnt(2)

0 commit comments

Comments
 (0)