@@ -399,12 +399,6 @@ static cl::opt<bool> EnableEarlyExitVectorization(
399
399
cl::desc(
400
400
" Enable vectorization of early exit loops with uncountable exits." ));
401
401
402
- // Likelyhood of bypassing the vectorized loop because assumptions about SCEV
403
- // variables not overflowing do not hold. See `emitSCEVChecks`.
404
- static constexpr uint32_t SCEVCheckBypassWeights[] = {1 , 127 };
405
- // Likelyhood of bypassing the vectorized loop because pointers overlap. See
406
- // `emitMemRuntimeChecks`.
407
- static constexpr uint32_t MemCheckBypassWeights[] = {1 , 127 };
408
402
// Likelihood of bypassing the vectorized loop because there are zero trips left
409
403
// after prolog. See `emitIterationCountCheck`.
410
404
static constexpr uint32_t MinItersBypassWeights[] = {1 , 127 };
@@ -544,16 +538,6 @@ class InnerLoopVectorizer {
544
538
// / it overflows.
545
539
void emitIterationCountCheck (BasicBlock *Bypass);
546
540
547
- // / Emit a bypass check to see if all of the SCEV assumptions we've
548
- // / had to make are correct. Returns the block containing the checks or
549
- // / nullptr if no checks have been added.
550
- BasicBlock *emitSCEVChecks (BasicBlock *Bypass);
551
-
552
- // / Emit bypass checks to check any memory assumptions we may have made.
553
- // / Returns the block containing the checks or nullptr if no checks have been
554
- // / added.
555
- BasicBlock *emitMemRuntimeChecks (BasicBlock *Bypass);
556
-
557
541
// / Emit basic blocks (prefixed with \p Prefix) for the iteration check,
558
542
// / vector loop preheader, middle block and scalar preheader.
559
543
void createVectorLoopSkeleton (StringRef Prefix);
@@ -657,8 +641,6 @@ struct EpilogueLoopVectorizationInfo {
657
641
unsigned EpilogueUF = 0 ;
658
642
BasicBlock *MainLoopIterationCountCheck = nullptr ;
659
643
BasicBlock *EpilogueIterationCountCheck = nullptr ;
660
- BasicBlock *SCEVSafetyCheck = nullptr ;
661
- BasicBlock *MemSafetyCheck = nullptr ;
662
644
Value *TripCount = nullptr ;
663
645
Value *VectorTripCount = nullptr ;
664
646
VPlan &EpiloguePlan;
@@ -1786,7 +1768,6 @@ class GeneratedRTChecks {
1786
1768
SCEVExpander MemCheckExp;
1787
1769
1788
1770
bool CostTooHigh = false ;
1789
- const bool AddBranchWeights;
1790
1771
1791
1772
Loop *OuterLoop = nullptr ;
1792
1773
@@ -1798,11 +1779,10 @@ class GeneratedRTChecks {
1798
1779
public:
1799
1780
/// Stores the dominator tree, loop info, TTI, predicated SE and cost kind, and
/// initializes SCEVExp and MemCheckExp. The constructor body itself is empty.
// NOTE(review): MemCheckExp is given the same "scev.check" name as SCEVExp -
// looks like a copy-paste; confirm whether a distinct name was intended.
GeneratedRTChecks (PredicatedScalarEvolution &PSE, DominatorTree *DT,
1800
1781
LoopInfo *LI, TargetTransformInfo *TTI,
1801
- const DataLayout &DL, bool AddBranchWeights,
1802
- TTI::TargetCostKind CostKind)
1782
+ const DataLayout &DL, TTI::TargetCostKind CostKind)
1803
1783
: DT(DT), LI(LI), TTI(TTI), SCEVExp(*PSE.getSE(), DL, " scev.check" ),
1804
- MemCheckExp (*PSE.getSE(), DL, "scev.check"),
1805
- AddBranchWeights(AddBranchWeights), PSE(PSE), CostKind(CostKind) {}
1784
+ MemCheckExp (*PSE.getSE(), DL, "scev.check"), PSE(PSE),
1785
+ CostKind(CostKind) {}
1806
1786
1807
1787
// / Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1808
1788
// / accurately estimate the cost of the runtime checks. The blocks are
@@ -2019,56 +1999,20 @@ class GeneratedRTChecks {
2019
1999
MemCheckBlock->eraseFromParent ();
2020
2000
}
2021
2001
2022
- // / Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2023
- // / adjusts the branches to branch to the vector preheader or \p Bypass,
2024
- // / depending on the generated condition.
2025
- BasicBlock *emitSCEVChecks (BasicBlock *Bypass,
2026
- BasicBlock *LoopVectorPreHeader) {
2002
+ // / Returns {SCEVCheckCond, SCEVCheckBlock} generated as IR outside VPlan, or
2003
+ // / {nullptr, nullptr} if the condition is null or matches m_ZeroInt().
2004
+ std::pair<Value *, BasicBlock *> getSCEVChecks () {
2027
2005
using namespace llvm ::PatternMatch;
2028
2006
if (!SCEVCheckCond || match (SCEVCheckCond, m_ZeroInt ()))
2029
- return nullptr ;
2030
-
2031
- auto *Pred = LoopVectorPreHeader->getSinglePredecessor ();
2032
- BranchInst::Create (LoopVectorPreHeader, SCEVCheckBlock);
2033
-
2034
- SCEVCheckBlock->getTerminator ()->eraseFromParent ();
2035
- SCEVCheckBlock->moveBefore (LoopVectorPreHeader);
2036
- Pred->getTerminator ()->replaceSuccessorWith (LoopVectorPreHeader,
2037
- SCEVCheckBlock);
2038
-
2039
- BranchInst &BI =
2040
- *BranchInst::Create (Bypass, LoopVectorPreHeader, SCEVCheckCond);
2041
- if (AddBranchWeights)
2042
- setBranchWeights (BI, SCEVCheckBypassWeights, /* IsExpected=*/ false );
2043
- ReplaceInstWithInst (SCEVCheckBlock->getTerminator (), &BI);
2044
- return SCEVCheckBlock;
2045
- }
2046
-
2047
- // / Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2048
- // / the branches to branch to the vector preheader or \p Bypass, depending on
2049
- // / the generated condition.
2050
- BasicBlock *emitMemRuntimeChecks (BasicBlock *Bypass,
2051
- BasicBlock *LoopVectorPreHeader) {
2052
- // Check if we generated code that checks in runtime if arrays overlap.
2053
- if (!MemRuntimeCheckCond)
2054
- return nullptr ;
2055
-
2056
- auto *Pred = LoopVectorPreHeader->getSinglePredecessor ();
2057
- Pred->getTerminator ()->replaceSuccessorWith (LoopVectorPreHeader,
2058
- MemCheckBlock);
2007
+ return {nullptr , nullptr };
2059
2008
2060
- MemCheckBlock->moveBefore (LoopVectorPreHeader);
2061
-
2062
- BranchInst &BI =
2063
- *BranchInst::Create (Bypass, LoopVectorPreHeader, MemRuntimeCheckCond);
2064
- if (AddBranchWeights) {
2065
- setBranchWeights (BI, MemCheckBypassWeights, /* IsExpected=*/ false );
2066
- }
2067
- ReplaceInstWithInst (MemCheckBlock->getTerminator (), &BI);
2068
- MemCheckBlock->getTerminator ()->setDebugLoc (
2069
- Pred->getTerminator ()->getDebugLoc ());
2009
+ return {SCEVCheckCond, SCEVCheckBlock};
2010
+ }
2070
2011
2071
- return MemCheckBlock;
2012
+ // / Returns {MemRuntimeCheckCond, MemCheckBlock} generated as IR outside VPlan.
2013
+ // / Both are returned as stored; unlike getSCEVChecks nothing is filtered.
2014
+ std::pair<Value *, BasicBlock *> getMemRuntimeChecks () {
2015
+ return {MemRuntimeCheckCond, MemCheckBlock};
2072
2016
}
2073
2017
2074
2018
// / Return true if any runtime checks have been added
@@ -2461,53 +2405,6 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
2461
2405
" Plan's entry must be TCCCheckBlock" );
2462
2406
}
2463
2407
2464
- BasicBlock *InnerLoopVectorizer::emitSCEVChecks (BasicBlock *Bypass) {
2465
- BasicBlock *const SCEVCheckBlock =
2466
- RTChecks.emitSCEVChecks (Bypass, LoopVectorPreHeader);
2467
- if (!SCEVCheckBlock)
2468
- return nullptr ;
2469
-
2470
- assert ((!Cost->OptForSize ||
2471
- Cost->Hints ->getForce () == LoopVectorizeHints::FK_Enabled) &&
2472
- " Cannot SCEV check stride or overflow when optimizing for size" );
2473
-
2474
- introduceCheckBlockInVPlan (SCEVCheckBlock);
2475
- return SCEVCheckBlock;
2476
- }
2477
-
2478
- BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks (BasicBlock *Bypass) {
2479
- BasicBlock *const MemCheckBlock =
2480
- RTChecks.emitMemRuntimeChecks (Bypass, LoopVectorPreHeader);
2481
-
2482
- // Check if we generated code that checks in runtime if arrays overlap. We put
2483
- // the checks into a separate block to make the more common case of few
2484
- // elements faster.
2485
- if (!MemCheckBlock)
2486
- return nullptr ;
2487
-
2488
- // VPlan-native path does not do any analysis for runtime checks currently.
2489
- assert ((!EnableVPlanNativePath || OrigLoop->begin () == OrigLoop->end ()) &&
2490
- " Runtime checks are not supported for outer loops yet" );
2491
-
2492
- if (Cost->OptForSize ) {
2493
- assert (Cost->Hints ->getForce () == LoopVectorizeHints::FK_Enabled &&
2494
- " Cannot emit memory checks when optimizing for size, unless forced "
2495
- " to vectorize." );
2496
- ORE->emit ([&]() {
2497
- return OptimizationRemarkAnalysis (DEBUG_TYPE, " VectorizationCodeSize" ,
2498
- OrigLoop->getStartLoc (),
2499
- OrigLoop->getHeader ())
2500
- << " Code-size may be reduced by not forcing "
2501
- " vectorization, or by source-code modifications "
2502
- " eliminating the need for runtime checks "
2503
- " (e.g., adding 'restrict')." ;
2504
- });
2505
- }
2506
-
2507
- introduceCheckBlockInVPlan (MemCheckBlock);
2508
- return MemCheckBlock;
2509
- }
2510
-
2511
2408
// / Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p
2512
2409
// / VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must
2513
2410
// / have a single predecessor, which is rewired to the new VPIRBasicBlock. All
@@ -2624,15 +2521,6 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
2624
2521
// to the scalar loop.
2625
2522
emitIterationCountCheck (LoopScalarPreHeader);
2626
2523
2627
- // Generate the code to check any assumptions that we've made for SCEV
2628
- // expressions.
2629
- emitSCEVChecks (LoopScalarPreHeader);
2630
-
2631
- // Generate the code that checks in runtime if arrays overlap. We put the
2632
- // checks into a separate block to make the more common case of few elements
2633
- // faster.
2634
- emitMemRuntimeChecks (LoopScalarPreHeader);
2635
-
2636
2524
replaceVPBBWithIRVPBB (Plan.getScalarPreheader (), LoopScalarPreHeader);
2637
2525
return LoopVectorPreHeader;
2638
2526
}
@@ -7323,11 +7211,22 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
7323
7211
OrigLoop->getHeader ()->getContext ());
7324
7212
VPlanTransforms::runPass (VPlanTransforms::replicateByVF, BestVPlan, BestVF);
7325
7213
VPlanTransforms::runPass (VPlanTransforms::materializeBroadcasts, BestVPlan);
7326
- if (hasBranchWeightMD (*OrigLoop->getLoopLatch ()->getTerminator ())) {
7214
+ bool HasBranchWeights =
7215
+ hasBranchWeightMD (*OrigLoop->getLoopLatch ()->getTerminator ());
7216
+ if (HasBranchWeights) {
7327
7217
std::optional<unsigned > VScale = CM.getVScaleForTuning ();
7328
7218
VPlanTransforms::runPass (VPlanTransforms::addBranchWeightToMiddleTerminator,
7329
7219
BestVPlan, BestVF, VScale);
7330
7220
}
7221
+
7222
+ if (!VectorizingEpilogue) {
7223
+ // Checks are the same for all VPlans, added to BestVPlan only for
7224
+ // compactness.
7225
+ attachRuntimeChecks (BestVPlan, ILV.RTChecks , HasBranchWeights);
7226
+ }
7227
+
7228
+ // Retrieving VectorPH now when it's easier while VPlan still has Regions.
7229
+ VPBasicBlock *VectorPH = cast<VPBasicBlock>(BestVPlan.getVectorPreheader ());
7331
7230
VPlanTransforms::optimizeForVFAndUF (BestVPlan, BestVF, BestUF, PSE);
7332
7231
VPlanTransforms::simplifyRecipes (BestVPlan, *Legal->getWidestInductionType ());
7333
7232
VPlanTransforms::narrowInterleaveGroups (
@@ -7375,7 +7274,8 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
7375
7274
7376
7275
// 1. Set up the skeleton for vectorization, including vector pre-header and
7377
7276
// middle block. The vector loop is created during VPlan execution.
7378
- VPBasicBlock *VectorPH = cast<VPBasicBlock>(Entry->getSuccessors ()[1 ]);
7277
+ BasicBlock *EntryBB =
7278
+ cast<VPIRBasicBlock>(BestVPlan.getEntry ())->getIRBasicBlock ();
7379
7279
State.CFG .PrevBB = ILV.createVectorizedLoopSkeleton ();
7380
7280
if (VectorizingEpilogue)
7381
7281
VPlanTransforms::removeDeadRecipes (BestVPlan);
@@ -7399,6 +7299,13 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
7399
7299
ILV.getOrCreateVectorTripCount (ILV.LoopVectorPreHeader ), State);
7400
7300
replaceVPBBWithIRVPBB (VectorPH, State.CFG .PrevBB );
7401
7301
7302
+ // Move check blocks to their final position.
7303
+ // TODO: Move as part of VPIRBB execute and update impacted tests.
7304
+ if (BasicBlock *MemCheckBlock = ILV.RTChecks .getMemRuntimeChecks ().second )
7305
+ MemCheckBlock->moveAfter (EntryBB);
7306
+ if (BasicBlock *SCEVCheckBlock = ILV.RTChecks .getSCEVChecks ().second )
7307
+ SCEVCheckBlock->moveAfter (EntryBB);
7308
+
7402
7309
BestVPlan.execute (&State);
7403
7310
7404
7311
// 2.5 When vectorizing the epilogue, fix reduction resume values from the
@@ -7499,15 +7406,6 @@ BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
7499
7406
emitIterationCountCheck (LoopScalarPreHeader, true );
7500
7407
EPI.EpilogueIterationCountCheck ->setName (" iter.check" );
7501
7408
7502
- // Generate the code to check any assumptions that we've made for SCEV
7503
- // expressions.
7504
- EPI.SCEVSafetyCheck = emitSCEVChecks (LoopScalarPreHeader);
7505
-
7506
- // Generate the code that checks at runtime if arrays overlap. We put the
7507
- // checks into a separate block to make the more common case of few elements
7508
- // faster.
7509
- EPI.MemSafetyCheck = emitMemRuntimeChecks (LoopScalarPreHeader);
7510
-
7511
7409
// Generate the iteration count check for the main loop, *after* the check
7512
7410
// for the epilogue loop, so that the path-length is shorter for the case
7513
7411
// that goes directly through the vector epilogue. The longer-path length for
@@ -7611,11 +7509,14 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
7611
7509
EPI.EpilogueIterationCountCheck ->getTerminator ()->replaceUsesOfWith (
7612
7510
VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7613
7511
7614
- if (EPI.SCEVSafetyCheck )
7615
- EPI.SCEVSafetyCheck ->getTerminator ()->replaceUsesOfWith (
7512
+ // Adjust the terminators of runtime check blocks and phis using them.
7513
+ BasicBlock *SCEVCheckBlock = RTChecks.getSCEVChecks ().second ;
7514
+ BasicBlock *MemCheckBlock = RTChecks.getMemRuntimeChecks ().second ;
7515
+ if (SCEVCheckBlock)
7516
+ SCEVCheckBlock->getTerminator ()->replaceUsesOfWith (
7616
7517
VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7617
- if (EPI. MemSafetyCheck )
7618
- EPI. MemSafetyCheck ->getTerminator ()->replaceUsesOfWith (
7518
+ if (MemCheckBlock )
7519
+ MemCheckBlock ->getTerminator ()->replaceUsesOfWith (
7619
7520
VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7620
7521
7621
7522
DT->changeImmediateDominator (LoopScalarPreHeader,
@@ -7642,10 +7543,10 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
7642
7543
}))
7643
7544
continue ;
7644
7545
Phi->removeIncomingValue (EPI.EpilogueIterationCountCheck );
7645
- if (EPI. SCEVSafetyCheck )
7646
- Phi->removeIncomingValue (EPI. SCEVSafetyCheck );
7647
- if (EPI. MemSafetyCheck )
7648
- Phi->removeIncomingValue (EPI. MemSafetyCheck );
7546
+ if (SCEVCheckBlock )
7547
+ Phi->removeIncomingValue (SCEVCheckBlock );
7548
+ if (MemCheckBlock )
7549
+ Phi->removeIncomingValue (MemCheckBlock );
7649
7550
}
7650
7551
7651
7552
replaceVPBBWithIRVPBB (Plan.getScalarPreheader (), LoopScalarPreHeader);
@@ -9380,6 +9281,43 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
9380
9281
VPlanTransforms::runPass (VPlanTransforms::clearReductionWrapFlags, *Plan);
9381
9282
}
9382
9283
9284
/// Attaches the pre-generated SCEV and memory runtime check blocks held by
/// \p RTChecks to \p Plan via VPlanTransforms::attachCheckBlock. A check is
/// attached only when its block is non-null; the SCEV check is handled first.
/// \p HasBranchWeights is forwarded unchanged to attachCheckBlock. When a
/// memory check is attached while CM.OptForSize is set, an optimization remark
/// about code size is emitted through ORE.
+ void LoopVectorizationPlanner::attachRuntimeChecks (
9285
+ VPlan &Plan, GeneratedRTChecks &RTChecks, bool HasBranchWeights) const {
9286
+ const auto &[SCEVCheckCond, SCEVCheckBlock] = RTChecks.getSCEVChecks ();
9287
+ if (SCEVCheckBlock) {
9288
+ assert ((!CM.OptForSize ||
9289
+ CM.Hints ->getForce () == LoopVectorizeHints::FK_Enabled) &&
9290
+ " Cannot SCEV check stride or overflow when optimizing for size" );
9291
+ VPlanTransforms::attachCheckBlock (Plan, SCEVCheckCond, SCEVCheckBlock,
9292
+ HasBranchWeights);
9293
+ }
9294
+ const auto &[MemCheckCond, MemCheckBlock] = RTChecks.getMemRuntimeChecks ();
9295
+ if (MemCheckBlock) {
9296
+ // VPlan-native path does not do any analysis for runtime checks
9297
+ // currently.
9298
+ assert ((!EnableVPlanNativePath || OrigLoop->isInnermost ()) &&
9299
+ " Runtime checks are not supported for outer loops yet" );
9300
+
9301
// When optimizing for size, memory checks are only expected if vectorization
// was forced; the remark below tells the user how to avoid them.
+ if (CM.OptForSize ) {
9302
+ assert (
9303
+ CM.Hints ->getForce () == LoopVectorizeHints::FK_Enabled &&
9304
+ " Cannot emit memory checks when optimizing for size, unless forced "
9305
+ " to vectorize." );
9306
+ ORE->emit ([&]() {
9307
+ return OptimizationRemarkAnalysis (DEBUG_TYPE, " VectorizationCodeSize" ,
9308
+ OrigLoop->getStartLoc (),
9309
+ OrigLoop->getHeader ())
9310
+ << " Code-size may be reduced by not forcing "
9311
+ " vectorization, or by source-code modifications "
9312
+ " eliminating the need for runtime checks "
9313
+ " (e.g., adding 'restrict')." ;
9314
+ });
9315
+ }
9316
+ VPlanTransforms::attachCheckBlock (Plan, MemCheckCond, MemCheckBlock,
9317
+ HasBranchWeights);
9318
+ }
9319
+ }
9320
+
9383
9321
void VPDerivedIVRecipe::execute (VPTransformState &State) {
9384
9322
assert (!State.Lane && " VPDerivedIVRecipe being replicated." );
9385
9323
@@ -9501,10 +9439,7 @@ static bool processLoopInVPlanNativePath(
9501
9439
VPlan &BestPlan = LVP.getPlanFor (VF.Width );
9502
9440
9503
9441
{
9504
- bool AddBranchWeights =
9505
- hasBranchWeightMD (*L->getLoopLatch ()->getTerminator ());
9506
- GeneratedRTChecks Checks (PSE, DT, LI, TTI, F->getDataLayout (),
9507
- AddBranchWeights, CM.CostKind );
9442
+ GeneratedRTChecks Checks (PSE, DT, LI, TTI, F->getDataLayout (), CM.CostKind );
9508
9443
InnerLoopVectorizer LB (L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width ,
9509
9444
VF.Width , 1 , &CM, BFI, PSI, Checks, BestPlan);
9510
9445
LLVM_DEBUG (dbgs () << " Vectorizing outer loop in \" "
@@ -10142,10 +10077,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
10142
10077
if (ORE->allowExtraAnalysis (LV_NAME))
10143
10078
LVP.emitInvalidCostRemarks (ORE);
10144
10079
10145
- bool AddBranchWeights =
10146
- hasBranchWeightMD (*L->getLoopLatch ()->getTerminator ());
10147
- GeneratedRTChecks Checks (PSE, DT, LI, TTI, F->getDataLayout (),
10148
- AddBranchWeights, CM.CostKind );
10080
+ GeneratedRTChecks Checks (PSE, DT, LI, TTI, F->getDataLayout (), CM.CostKind );
10149
10081
if (LVP.hasPlanWithVF (VF.Width )) {
10150
10082
// Select the interleave count.
10151
10083
IC = CM.selectInterleaveCount (LVP.getPlanFor (VF.Width ), VF.Width , VF.Cost );
0 commit comments