Skip to content

Commit 64686c5

Browse files
authored
[VPlan] Connect (MemRuntime|SCEV)Check blocks as VPlan transform (NFC). (#143879)
Connect the SCEV and memory runtime check blocks directly in VPlan as VPIRBasicBlocks, removing ILV::emitSCEVChecks and ILV::emitMemRuntimeChecks. The new logic is currently split across two places: LoopVectorizationPlanner::attachRuntimeChecks retrieves the {Condition, CheckBlock} pair for each kind of check, performs some sanity checks (assertions) and emits remarks if needed; each pair is then added to the VPlan by VPlanTransforms::attachCheckBlock. PR: #143879
1 parent ba6f872 commit 64686c5

File tree

6 files changed

+182
-187
lines changed

6 files changed

+182
-187
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,10 @@
2828
#include "llvm/ADT/SmallSet.h"
2929
#include "llvm/Support/InstructionCost.h"
3030

31+
namespace {
32+
class GeneratedRTChecks;
33+
}
34+
3135
namespace llvm {
3236

3337
class LoopInfo;
@@ -554,6 +558,10 @@ class LoopVectorizationPlanner {
554558
VPRecipeBuilder &RecipeBuilder,
555559
ElementCount MinVF);
556560

561+
/// Attach the runtime checks of \p RTChecks to \p Plan.
562+
void attachRuntimeChecks(VPlan &Plan, GeneratedRTChecks &RTChecks,
563+
bool HasBranchWeights) const;
564+
557565
#ifndef NDEBUG
558566
/// \return The most profitable vectorization factor for the available VPlans
559567
/// and the cost of that VF.

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 84 additions & 152 deletions
Original file line numberDiff line numberDiff line change
@@ -399,12 +399,6 @@ static cl::opt<bool> EnableEarlyExitVectorization(
399399
cl::desc(
400400
"Enable vectorization of early exit loops with uncountable exits."));
401401

402-
// Likelyhood of bypassing the vectorized loop because assumptions about SCEV
403-
// variables not overflowing do not hold. See `emitSCEVChecks`.
404-
static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
405-
// Likelyhood of bypassing the vectorized loop because pointers overlap. See
406-
// `emitMemRuntimeChecks`.
407-
static constexpr uint32_t MemCheckBypassWeights[] = {1, 127};
408402
// Likelihood of bypassing the vectorized loop because there are zero trips left
409403
// after prolog. See `emitIterationCountCheck`.
410404
static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
@@ -544,16 +538,6 @@ class InnerLoopVectorizer {
544538
/// it overflows.
545539
void emitIterationCountCheck(BasicBlock *Bypass);
546540

547-
/// Emit a bypass check to see if all of the SCEV assumptions we've
548-
/// had to make are correct. Returns the block containing the checks or
549-
/// nullptr if no checks have been added.
550-
BasicBlock *emitSCEVChecks(BasicBlock *Bypass);
551-
552-
/// Emit bypass checks to check any memory assumptions we may have made.
553-
/// Returns the block containing the checks or nullptr if no checks have been
554-
/// added.
555-
BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass);
556-
557541
/// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
558542
/// vector loop preheader, middle block and scalar preheader.
559543
void createVectorLoopSkeleton(StringRef Prefix);
@@ -657,8 +641,6 @@ struct EpilogueLoopVectorizationInfo {
657641
unsigned EpilogueUF = 0;
658642
BasicBlock *MainLoopIterationCountCheck = nullptr;
659643
BasicBlock *EpilogueIterationCountCheck = nullptr;
660-
BasicBlock *SCEVSafetyCheck = nullptr;
661-
BasicBlock *MemSafetyCheck = nullptr;
662644
Value *TripCount = nullptr;
663645
Value *VectorTripCount = nullptr;
664646
VPlan &EpiloguePlan;
@@ -1786,7 +1768,6 @@ class GeneratedRTChecks {
17861768
SCEVExpander MemCheckExp;
17871769

17881770
bool CostTooHigh = false;
1789-
const bool AddBranchWeights;
17901771

17911772
Loop *OuterLoop = nullptr;
17921773

@@ -1798,11 +1779,10 @@ class GeneratedRTChecks {
17981779
public:
17991780
GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
18001781
LoopInfo *LI, TargetTransformInfo *TTI,
1801-
const DataLayout &DL, bool AddBranchWeights,
1802-
TTI::TargetCostKind CostKind)
1782+
const DataLayout &DL, TTI::TargetCostKind CostKind)
18031783
: DT(DT), LI(LI), TTI(TTI), SCEVExp(*PSE.getSE(), DL, "scev.check"),
1804-
MemCheckExp(*PSE.getSE(), DL, "scev.check"),
1805-
AddBranchWeights(AddBranchWeights), PSE(PSE), CostKind(CostKind) {}
1784+
MemCheckExp(*PSE.getSE(), DL, "scev.check"), PSE(PSE),
1785+
CostKind(CostKind) {}
18061786

18071787
/// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
18081788
/// accurately estimate the cost of the runtime checks. The blocks are
@@ -2019,56 +1999,20 @@ class GeneratedRTChecks {
20191999
MemCheckBlock->eraseFromParent();
20202000
}
20212001

2022-
/// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2023-
/// adjusts the branches to branch to the vector preheader or \p Bypass,
2024-
/// depending on the generated condition.
2025-
BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
2026-
BasicBlock *LoopVectorPreHeader) {
2002+
/// Retrieves the SCEVCheckCond and SCEVCheckBlock that were generated as IR
2003+
/// outside VPlan.
2004+
std::pair<Value *, BasicBlock *> getSCEVChecks() {
20272005
using namespace llvm::PatternMatch;
20282006
if (!SCEVCheckCond || match(SCEVCheckCond, m_ZeroInt()))
2029-
return nullptr;
2030-
2031-
auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2032-
BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2033-
2034-
SCEVCheckBlock->getTerminator()->eraseFromParent();
2035-
SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2036-
Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2037-
SCEVCheckBlock);
2038-
2039-
BranchInst &BI =
2040-
*BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond);
2041-
if (AddBranchWeights)
2042-
setBranchWeights(BI, SCEVCheckBypassWeights, /*IsExpected=*/false);
2043-
ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI);
2044-
return SCEVCheckBlock;
2045-
}
2046-
2047-
/// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2048-
/// the branches to branch to the vector preheader or \p Bypass, depending on
2049-
/// the generated condition.
2050-
BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
2051-
BasicBlock *LoopVectorPreHeader) {
2052-
// Check if we generated code that checks in runtime if arrays overlap.
2053-
if (!MemRuntimeCheckCond)
2054-
return nullptr;
2055-
2056-
auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2057-
Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2058-
MemCheckBlock);
2007+
return {nullptr, nullptr};
20592008

2060-
MemCheckBlock->moveBefore(LoopVectorPreHeader);
2061-
2062-
BranchInst &BI =
2063-
*BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond);
2064-
if (AddBranchWeights) {
2065-
setBranchWeights(BI, MemCheckBypassWeights, /*IsExpected=*/false);
2066-
}
2067-
ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI);
2068-
MemCheckBlock->getTerminator()->setDebugLoc(
2069-
Pred->getTerminator()->getDebugLoc());
2009+
return {SCEVCheckCond, SCEVCheckBlock};
2010+
}
20702011

2071-
return MemCheckBlock;
2012+
/// Retrieves the MemCheckCond and MemCheckBlock that were generated as IR
2013+
/// outside VPlan.
2014+
std::pair<Value *, BasicBlock *> getMemRuntimeChecks() {
2015+
return {MemRuntimeCheckCond, MemCheckBlock};
20722016
}
20732017

20742018
/// Return true if any runtime checks have been added
@@ -2461,53 +2405,6 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
24612405
"Plan's entry must be TCCCheckBlock");
24622406
}
24632407

2464-
BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
2465-
BasicBlock *const SCEVCheckBlock =
2466-
RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader);
2467-
if (!SCEVCheckBlock)
2468-
return nullptr;
2469-
2470-
assert((!Cost->OptForSize ||
2471-
Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled) &&
2472-
"Cannot SCEV check stride or overflow when optimizing for size");
2473-
2474-
introduceCheckBlockInVPlan(SCEVCheckBlock);
2475-
return SCEVCheckBlock;
2476-
}
2477-
2478-
BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
2479-
BasicBlock *const MemCheckBlock =
2480-
RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
2481-
2482-
// Check if we generated code that checks in runtime if arrays overlap. We put
2483-
// the checks into a separate block to make the more common case of few
2484-
// elements faster.
2485-
if (!MemCheckBlock)
2486-
return nullptr;
2487-
2488-
// VPlan-native path does not do any analysis for runtime checks currently.
2489-
assert((!EnableVPlanNativePath || OrigLoop->begin() == OrigLoop->end()) &&
2490-
"Runtime checks are not supported for outer loops yet");
2491-
2492-
if (Cost->OptForSize) {
2493-
assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2494-
"Cannot emit memory checks when optimizing for size, unless forced "
2495-
"to vectorize.");
2496-
ORE->emit([&]() {
2497-
return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2498-
OrigLoop->getStartLoc(),
2499-
OrigLoop->getHeader())
2500-
<< "Code-size may be reduced by not forcing "
2501-
"vectorization, or by source-code modifications "
2502-
"eliminating the need for runtime checks "
2503-
"(e.g., adding 'restrict').";
2504-
});
2505-
}
2506-
2507-
introduceCheckBlockInVPlan(MemCheckBlock);
2508-
return MemCheckBlock;
2509-
}
2510-
25112408
/// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p
25122409
/// VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must
25132410
/// have a single predecessor, which is rewired to the new VPIRBasicBlock. All
@@ -2624,15 +2521,6 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
26242521
// to the scalar loop.
26252522
emitIterationCountCheck(LoopScalarPreHeader);
26262523

2627-
// Generate the code to check any assumptions that we've made for SCEV
2628-
// expressions.
2629-
emitSCEVChecks(LoopScalarPreHeader);
2630-
2631-
// Generate the code that checks in runtime if arrays overlap. We put the
2632-
// checks into a separate block to make the more common case of few elements
2633-
// faster.
2634-
emitMemRuntimeChecks(LoopScalarPreHeader);
2635-
26362524
replaceVPBBWithIRVPBB(Plan.getScalarPreheader(), LoopScalarPreHeader);
26372525
return LoopVectorPreHeader;
26382526
}
@@ -7323,11 +7211,22 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
73237211
OrigLoop->getHeader()->getContext());
73247212
VPlanTransforms::runPass(VPlanTransforms::replicateByVF, BestVPlan, BestVF);
73257213
VPlanTransforms::runPass(VPlanTransforms::materializeBroadcasts, BestVPlan);
7326-
if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
7214+
bool HasBranchWeights =
7215+
hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator());
7216+
if (HasBranchWeights) {
73277217
std::optional<unsigned> VScale = CM.getVScaleForTuning();
73287218
VPlanTransforms::runPass(VPlanTransforms::addBranchWeightToMiddleTerminator,
73297219
BestVPlan, BestVF, VScale);
73307220
}
7221+
7222+
if (!VectorizingEpilogue) {
7223+
// Checks are the same for all VPlans, added to BestVPlan only for
7224+
// compactness.
7225+
attachRuntimeChecks(BestVPlan, ILV.RTChecks, HasBranchWeights);
7226+
}
7227+
7228+
// Retrieve VectorPH now; it is easier while the VPlan still has regions.
7229+
VPBasicBlock *VectorPH = cast<VPBasicBlock>(BestVPlan.getVectorPreheader());
73317230
VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
73327231
VPlanTransforms::simplifyRecipes(BestVPlan, *Legal->getWidestInductionType());
73337232
VPlanTransforms::narrowInterleaveGroups(
@@ -7375,7 +7274,8 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
73757274

73767275
// 1. Set up the skeleton for vectorization, including vector pre-header and
73777276
// middle block. The vector loop is created during VPlan execution.
7378-
VPBasicBlock *VectorPH = cast<VPBasicBlock>(Entry->getSuccessors()[1]);
7277+
BasicBlock *EntryBB =
7278+
cast<VPIRBasicBlock>(BestVPlan.getEntry())->getIRBasicBlock();
73797279
State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
73807280
if (VectorizingEpilogue)
73817281
VPlanTransforms::removeDeadRecipes(BestVPlan);
@@ -7399,6 +7299,13 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
73997299
ILV.getOrCreateVectorTripCount(ILV.LoopVectorPreHeader), State);
74007300
replaceVPBBWithIRVPBB(VectorPH, State.CFG.PrevBB);
74017301

7302+
// Move check blocks to their final position.
7303+
// TODO: Move as part of VPIRBB execute and update impacted tests.
7304+
if (BasicBlock *MemCheckBlock = ILV.RTChecks.getMemRuntimeChecks().second)
7305+
MemCheckBlock->moveAfter(EntryBB);
7306+
if (BasicBlock *SCEVCheckBlock = ILV.RTChecks.getSCEVChecks().second)
7307+
SCEVCheckBlock->moveAfter(EntryBB);
7308+
74027309
BestVPlan.execute(&State);
74037310

74047311
// 2.5 When vectorizing the epilogue, fix reduction resume values from the
@@ -7499,15 +7406,6 @@ BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
74997406
emitIterationCountCheck(LoopScalarPreHeader, true);
75007407
EPI.EpilogueIterationCountCheck->setName("iter.check");
75017408

7502-
// Generate the code to check any assumptions that we've made for SCEV
7503-
// expressions.
7504-
EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader);
7505-
7506-
// Generate the code that checks at runtime if arrays overlap. We put the
7507-
// checks into a separate block to make the more common case of few elements
7508-
// faster.
7509-
EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader);
7510-
75117409
// Generate the iteration count check for the main loop, *after* the check
75127410
// for the epilogue loop, so that the path-length is shorter for the case
75137411
// that goes directly through the vector epilogue. The longer-path length for
@@ -7611,11 +7509,14 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
76117509
EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
76127510
VecEpilogueIterationCountCheck, LoopScalarPreHeader);
76137511

7614-
if (EPI.SCEVSafetyCheck)
7615-
EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
7512+
// Adjust the terminators of runtime check blocks and phis using them.
7513+
BasicBlock *SCEVCheckBlock = RTChecks.getSCEVChecks().second;
7514+
BasicBlock *MemCheckBlock = RTChecks.getMemRuntimeChecks().second;
7515+
if (SCEVCheckBlock)
7516+
SCEVCheckBlock->getTerminator()->replaceUsesOfWith(
76167517
VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7617-
if (EPI.MemSafetyCheck)
7618-
EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
7518+
if (MemCheckBlock)
7519+
MemCheckBlock->getTerminator()->replaceUsesOfWith(
76197520
VecEpilogueIterationCountCheck, LoopScalarPreHeader);
76207521

76217522
DT->changeImmediateDominator(LoopScalarPreHeader,
@@ -7642,10 +7543,10 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
76427543
}))
76437544
continue;
76447545
Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
7645-
if (EPI.SCEVSafetyCheck)
7646-
Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
7647-
if (EPI.MemSafetyCheck)
7648-
Phi->removeIncomingValue(EPI.MemSafetyCheck);
7546+
if (SCEVCheckBlock)
7547+
Phi->removeIncomingValue(SCEVCheckBlock);
7548+
if (MemCheckBlock)
7549+
Phi->removeIncomingValue(MemCheckBlock);
76497550
}
76507551

76517552
replaceVPBBWithIRVPBB(Plan.getScalarPreheader(), LoopScalarPreHeader);
@@ -9380,6 +9281,43 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
93809281
VPlanTransforms::runPass(VPlanTransforms::clearReductionWrapFlags, *Plan);
93819282
}
93829283

9284+
void LoopVectorizationPlanner::attachRuntimeChecks(
9285+
VPlan &Plan, GeneratedRTChecks &RTChecks, bool HasBranchWeights) const {
9286+
const auto &[SCEVCheckCond, SCEVCheckBlock] = RTChecks.getSCEVChecks();
9287+
if (SCEVCheckBlock) {
9288+
assert((!CM.OptForSize ||
9289+
CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled) &&
9290+
"Cannot SCEV check stride or overflow when optimizing for size");
9291+
VPlanTransforms::attachCheckBlock(Plan, SCEVCheckCond, SCEVCheckBlock,
9292+
HasBranchWeights);
9293+
}
9294+
const auto &[MemCheckCond, MemCheckBlock] = RTChecks.getMemRuntimeChecks();
9295+
if (MemCheckBlock) {
9296+
// VPlan-native path does not do any analysis for runtime checks
9297+
// currently.
9298+
assert((!EnableVPlanNativePath || OrigLoop->isInnermost()) &&
9299+
"Runtime checks are not supported for outer loops yet");
9300+
9301+
if (CM.OptForSize) {
9302+
assert(
9303+
CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
9304+
"Cannot emit memory checks when optimizing for size, unless forced "
9305+
"to vectorize.");
9306+
ORE->emit([&]() {
9307+
return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
9308+
OrigLoop->getStartLoc(),
9309+
OrigLoop->getHeader())
9310+
<< "Code-size may be reduced by not forcing "
9311+
"vectorization, or by source-code modifications "
9312+
"eliminating the need for runtime checks "
9313+
"(e.g., adding 'restrict').";
9314+
});
9315+
}
9316+
VPlanTransforms::attachCheckBlock(Plan, MemCheckCond, MemCheckBlock,
9317+
HasBranchWeights);
9318+
}
9319+
}
9320+
93839321
void VPDerivedIVRecipe::execute(VPTransformState &State) {
93849322
assert(!State.Lane && "VPDerivedIVRecipe being replicated.");
93859323

@@ -9501,10 +9439,7 @@ static bool processLoopInVPlanNativePath(
95019439
VPlan &BestPlan = LVP.getPlanFor(VF.Width);
95029440

95039441
{
9504-
bool AddBranchWeights =
9505-
hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
9506-
GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
9507-
AddBranchWeights, CM.CostKind);
9442+
GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), CM.CostKind);
95089443
InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
95099444
VF.Width, 1, &CM, BFI, PSI, Checks, BestPlan);
95109445
LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
@@ -10142,10 +10077,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1014210077
if (ORE->allowExtraAnalysis(LV_NAME))
1014310078
LVP.emitInvalidCostRemarks(ORE);
1014410079

10145-
bool AddBranchWeights =
10146-
hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
10147-
GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
10148-
AddBranchWeights, CM.CostKind);
10080+
GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), CM.CostKind);
1014910081
if (LVP.hasPlanWithVF(VF.Width)) {
1015010082
// Select the interleave count.
1015110083
IC = CM.selectInterleaveCount(LVP.getPlanFor(VF.Width), VF.Width, VF.Cost);

0 commit comments

Comments
 (0)