Skip to content

Commit 0a4dd84

Browse files
committed
[LV] Use VPReductionRecipe for partial reductions
Partial reductions can easily be represented by the VPReductionRecipe class by setting their scale factor to something greater than 1. This PR merges the two together and gives VPReductionRecipe a VFScaleFactor so that it can choose to generate the partial reduction intrinsic at execute time. Depends on llvm#144281
1 parent 7263435 commit 0a4dd84

File tree

11 files changed

+467
-885
lines changed

11 files changed

+467
-885
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 20 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -7045,7 +7045,8 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
70457045
}
70467046
// The VPlan-based cost model is more accurate for partial reduction and
70477047
// comparing against the legacy cost isn't desirable.
7048-
if (isa<VPPartialReductionRecipe>(&R))
7048+
if (auto *VPR = dyn_cast<VPReductionRecipe>(&R);
7049+
VPR && VPR->isPartialReduction())
70497050
return true;
70507051

70517052
/// If a VPlan transform folded a recipe to one producing a single-scalar,
@@ -8307,11 +8308,14 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R,
83078308
Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
83088309

83098310
// If the PHI is used by a partial reduction, set the scale factor.
8310-
unsigned ScaleFactor =
8311-
getScalingForReduction(RdxDesc.getLoopExitInstr()).value_or(1);
8312-
PhiRecipe = new VPReductionPHIRecipe(
8313-
Phi, RdxDesc, *StartV, CM.isInLoopReduction(Phi),
8314-
CM.useOrderedReductions(RdxDesc), ScaleFactor);
8311+
bool UseInLoopReduction = CM.isInLoopReduction(Phi);
8312+
bool UseOrderedReductions = CM.useOrderedReductions(RdxDesc);
8313+
auto ScaleFactor =
8314+
(UseOrderedReductions || UseInLoopReduction)
8315+
? 0
8316+
: getScalingForReduction(RdxDesc.getLoopExitInstr()).value_or(1);
8317+
PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
8318+
UseOrderedReductions, ScaleFactor);
83158319
} else {
83168320
// TODO: Currently fixed-order recurrences are modeled as chains of
83178321
// first-order recurrences. If there are no users of the intermediate
@@ -8375,7 +8379,8 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
83758379
VPValue *Accumulator = Operands[1];
83768380
VPRecipeBase *BinOpRecipe = BinOp->getDefiningRecipe();
83778381
if (isa<VPReductionPHIRecipe>(BinOpRecipe) ||
8378-
isa<VPPartialReductionRecipe>(BinOpRecipe))
8382+
(isa<VPReductionRecipe>(BinOpRecipe) &&
8383+
cast<VPReductionRecipe>(BinOpRecipe)->isPartialReduction()))
83798384
std::swap(BinOp, Accumulator);
83808385

83818386
unsigned ReductionOpcode = Reduction->getOpcode();
@@ -8396,12 +8401,10 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
83968401
"Expected an ADD or SUB operation for predicated partial "
83978402
"reductions (because the neutral element in the mask is zero)!");
83988403
Cond = getBlockInMask(Builder.getInsertBlock());
8399-
VPValue *Zero =
8400-
Plan.getOrAddLiveIn(ConstantInt::get(Reduction->getType(), 0));
8401-
BinOp = Builder.createSelect(Cond, BinOp, Zero, Reduction->getDebugLoc());
84028404
}
8403-
return new VPPartialReductionRecipe(ReductionOpcode, Accumulator, BinOp, Cond,
8404-
ScaleFactor, Reduction);
8405+
8406+
return new VPReductionRecipe(RecurKind::Add, FastMathFlags(), Reduction,
8407+
Accumulator, BinOp, Cond, false, ScaleFactor);
84058408
}
84068409

84078410
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
@@ -9189,9 +9192,11 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
91899192
FastMathFlags FMFs = isa<FPMathOperator>(CurrentLinkI)
91909193
? RdxDesc.getFastMathFlags()
91919194
: FastMathFlags();
9195+
bool UseOrderedReductions = CM.useOrderedReductions(RdxDesc);
9196+
unsigned VFScaleFactor = !UseOrderedReductions;
91929197
auto *RedRecipe = new VPReductionRecipe(
91939198
Kind, FMFs, CurrentLinkI, PreviousLink, VecOp, CondOp,
9194-
CM.useOrderedReductions(RdxDesc), CurrentLinkI->getDebugLoc());
9199+
UseOrderedReductions, VFScaleFactor, CurrentLinkI->getDebugLoc());
91959200
// Append the recipe to the end of the VPBasicBlock because we need to
91969201
// ensure that it comes after all of its inputs, including CondOp.
91979202
// Delete CurrentLink as it will be invalid if its operand is replaced
@@ -9225,8 +9230,9 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
92259230
// Don't output selects for partial reductions because they have an output
92269231
// with fewer lanes than the VF. So the operands of the select would have
92279232
// different numbers of lanes. Partial reductions mask the input instead.
9233+
auto *RR = dyn_cast<VPReductionRecipe>(OrigExitingVPV->getDefiningRecipe());
92289234
if (!PhiR->isInLoop() && CM.foldTailByMasking() &&
9229-
!isa<VPPartialReductionRecipe>(OrigExitingVPV->getDefiningRecipe())) {
9235+
(!RR || !RR->isPartialReduction())) {
92309236
VPValue *Cond = RecipeBuilder.getBlockInMask(PhiR->getParent());
92319237
std::optional<FastMathFlags> FMFs =
92329238
PhiTy->isFloatingPointTy()

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 45 additions & 90 deletions
Original file line numberDiff line numberDiff line change
@@ -552,7 +552,6 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
552552
case VPRecipeBase::VPWidenIntOrFpInductionSC:
553553
case VPRecipeBase::VPWidenPointerInductionSC:
554554
case VPRecipeBase::VPReductionPHISC:
555-
case VPRecipeBase::VPPartialReductionSC:
556555
return true;
557556
case VPRecipeBase::VPBranchOnMaskSC:
558557
case VPRecipeBase::VPInterleaveSC:
@@ -2194,34 +2193,37 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
21942193
/// Descriptor for the reduction.
21952194
const RecurrenceDescriptor &RdxDesc;
21962195

2197-
/// The phi is part of an in-loop reduction.
2198-
bool IsInLoop;
2199-
22002196
/// The phi is part of an ordered reduction. Requires isInLoop() to be true.
22012197
bool IsOrdered;
22022198

2203-
/// When expanding the reduction PHI, the plan's VF element count is divided
2204-
/// by this factor to form the reduction phi's VF.
2205-
unsigned VFScaleFactor = 1;
2199+
/// The scaling factor, relative to the VF, that this recipe's output is
2200+
/// divided by.
2201+
/// For regular out-of-loop reductions this is equal to 1.
2202+
/// For in-loop reductions this is equal to 0, to specify that this is equal
2203+
/// to the VF (which may not be known yet). For partial-reductions this is
2204+
/// equal to another scalar value.
2205+
unsigned VFScaleFactor;
22062206

22072207
public:
22082208
/// Create a new VPReductionPHIRecipe for the reduction \p Phi described by \p
22092209
/// RdxDesc.
22102210
VPReductionPHIRecipe(PHINode *Phi, const RecurrenceDescriptor &RdxDesc,
2211-
VPValue &Start, bool IsInLoop = false,
2212-
bool IsOrdered = false, unsigned VFScaleFactor = 1)
2211+
VPValue &Start, bool IsOrdered = false,
2212+
unsigned VFScaleFactor = 1)
22132213
: VPHeaderPHIRecipe(VPDef::VPReductionPHISC, Phi, &Start),
2214-
RdxDesc(RdxDesc), IsInLoop(IsInLoop), IsOrdered(IsOrdered),
2215-
VFScaleFactor(VFScaleFactor) {
2216-
assert((!IsOrdered || IsInLoop) && "IsOrdered requires IsInLoop");
2214+
RdxDesc(RdxDesc), IsOrdered(IsOrdered), VFScaleFactor(VFScaleFactor) {
2215+
assert((!IsOrdered || isInLoop()) &&
2216+
"IsOrdered requires the reduction to be in-loop");
2217+
assert(((!isInLoop() && !IsOrdered) || isInLoop()) &&
2218+
"Invalid VFScaleFactor");
22172219
}
22182220

22192221
~VPReductionPHIRecipe() override = default;
22202222

22212223
VPReductionPHIRecipe *clone() override {
22222224
auto *R = new VPReductionPHIRecipe(
22232225
dyn_cast_or_null<PHINode>(getUnderlyingValue()), RdxDesc,
2224-
*getOperand(0), IsInLoop, IsOrdered, VFScaleFactor);
2226+
*getOperand(0), IsOrdered, VFScaleFactor);
22252227
R->addOperand(getBackedgeValue());
22262228
return R;
22272229
}
@@ -2247,8 +2249,11 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
22472249
/// Returns true, if the phi is part of an ordered reduction.
22482250
bool isOrdered() const { return IsOrdered; }
22492251

2250-
/// Returns true, if the phi is part of an in-loop reduction.
2251-
bool isInLoop() const { return IsInLoop; }
2252+
/// Returns true if the phi is part of an in-loop reduction.
2253+
bool isInLoop() const { return VFScaleFactor == 0; }
2254+
2255+
/// Returns true if the reduction outputs a vector with a scaled down VF.
2256+
bool isPartialReduction() const { return VFScaleFactor > 1; }
22522257

22532258
/// Returns true if the recipe only uses the first lane of operand \p Op.
22542259
bool onlyFirstLaneUsed(const VPValue *Op) const override {
@@ -2421,23 +2426,32 @@ class VPInterleaveRecipe : public VPRecipeBase {
24212426
Instruction *getInsertPos() const { return IG->getInsertPos(); }
24222427
};
24232428

2424-
/// A recipe to represent inloop reduction operations, performing a reduction on
2425-
/// a vector operand into a scalar value, and adding the result to a chain.
2426-
/// The Operands are {ChainOp, VecOp, [Condition]}.
2429+
/// A recipe to represent inloop, ordered or partial reduction operations. It
2430+
/// performs a reduction on a vector operand into a scalar (vector in the case
2431+
/// of a partial reduction) value, and adds the result to a chain. The Operands
2432+
/// are {ChainOp, VecOp, [Condition]}.
24272433
class VPReductionRecipe : public VPRecipeWithIRFlags {
24282434
/// The recurrence kind for the reduction in question.
24292435
RecurKind RdxKind;
24302436
bool IsOrdered;
24312437
/// Whether the reduction is conditional.
24322438
bool IsConditional = false;
2439+
/// The scaling factor, relative to the VF, that this recipe's output is
2440+
/// divided by.
2441+
/// For outer-loop reductions this is equal to 1.
2442+
/// For in-loop reductions this is equal to 0, to specify that this is equal
2443+
/// to the VF (which may not be known yet).
2444+
/// For partial-reductions this is equal to another scalar value.
2445+
unsigned VFScaleFactor;
24332446

24342447
protected:
24352448
VPReductionRecipe(const unsigned char SC, RecurKind RdxKind,
24362449
FastMathFlags FMFs, Instruction *I,
24372450
ArrayRef<VPValue *> Operands, VPValue *CondOp,
2438-
bool IsOrdered, DebugLoc DL)
2451+
bool IsOrdered, unsigned VFScaleFactor, DebugLoc DL)
24392452
: VPRecipeWithIRFlags(SC, Operands, FMFs, DL), RdxKind(RdxKind),
2440-
IsOrdered(IsOrdered) {
2453+
IsOrdered(IsOrdered), VFScaleFactor(VFScaleFactor) {
2454+
assert((!IsOrdered || VFScaleFactor == 0) && "Invalid scale factor");
24412455
if (CondOp) {
24422456
IsConditional = true;
24432457
addOperand(CondOp);
@@ -2448,30 +2462,29 @@ class VPReductionRecipe : public VPRecipeWithIRFlags {
24482462
public:
24492463
VPReductionRecipe(RecurKind RdxKind, FastMathFlags FMFs, Instruction *I,
24502464
VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp,
2451-
bool IsOrdered, DebugLoc DL = {})
2465+
bool IsOrdered, unsigned VFScaleFactor, DebugLoc DL = {})
24522466
: VPReductionRecipe(VPDef::VPReductionSC, RdxKind, FMFs, I,
24532467
ArrayRef<VPValue *>({ChainOp, VecOp}), CondOp,
2454-
IsOrdered, DL) {}
2468+
IsOrdered, VFScaleFactor, DL) {}
24552469

24562470
VPReductionRecipe(const RecurKind RdxKind, FastMathFlags FMFs,
24572471
VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp,
2458-
bool IsOrdered, DebugLoc DL = {})
2472+
bool IsOrdered, unsigned VFScaleFactor, DebugLoc DL = {})
24592473
: VPReductionRecipe(VPDef::VPReductionSC, RdxKind, FMFs, nullptr,
24602474
ArrayRef<VPValue *>({ChainOp, VecOp}), CondOp,
2461-
IsOrdered, DL) {}
2475+
IsOrdered, VFScaleFactor, DL) {}
24622476

24632477
~VPReductionRecipe() override = default;
24642478

24652479
VPReductionRecipe *clone() override {
2466-
return new VPReductionRecipe(RdxKind, getFastMathFlags(),
2467-
getUnderlyingInstr(), getChainOp(), getVecOp(),
2468-
getCondOp(), IsOrdered, getDebugLoc());
2480+
return new VPReductionRecipe(
2481+
RdxKind, getFastMathFlags(), getUnderlyingInstr(), getChainOp(),
2482+
getVecOp(), getCondOp(), IsOrdered, VFScaleFactor, getDebugLoc());
24692483
}
24702484

24712485
static inline bool classof(const VPRecipeBase *R) {
24722486
return R->getVPDefID() == VPRecipeBase::VPReductionSC ||
2473-
R->getVPDefID() == VPRecipeBase::VPReductionEVLSC ||
2474-
R->getVPDefID() == VPRecipeBase::VPPartialReductionSC;
2487+
R->getVPDefID() == VPRecipeBase::VPReductionEVLSC;
24752488
}
24762489

24772490
static inline bool classof(const VPUser *U) {
@@ -2498,6 +2511,8 @@ class VPReductionRecipe : public VPRecipeWithIRFlags {
24982511
bool isOrdered() const { return IsOrdered; };
24992512
/// Return true if the in-loop reduction is conditional.
25002513
bool isConditional() const { return IsConditional; };
2514+
/// Returns true if the reduction outputs a vector with a scaled down VF.
2515+
bool isPartialReduction() const { return VFScaleFactor > 1; }
25012516
/// The VPValue of the scalar Chain being accumulated.
25022517
VPValue *getChainOp() const { return getOperand(0); }
25032518
/// The VPValue of the vector value to be reduced.
@@ -2506,68 +2521,8 @@ class VPReductionRecipe : public VPRecipeWithIRFlags {
25062521
VPValue *getCondOp() const {
25072522
return isConditional() ? getOperand(getNumOperands() - 1) : nullptr;
25082523
}
2509-
};
2510-
2511-
/// A recipe for forming partial reductions. In the loop, an accumulator and
2512-
/// vector operand are added together and passed to the next iteration as the
2513-
/// next accumulator. After the loop body, the accumulator is reduced to a
2514-
/// scalar value.
2515-
class VPPartialReductionRecipe : public VPReductionRecipe {
2516-
unsigned Opcode;
2517-
2518-
/// The divisor by which the VF of this recipe's output should be divided
2519-
/// during execution.
2520-
unsigned VFScaleFactor;
2521-
2522-
public:
2523-
VPPartialReductionRecipe(Instruction *ReductionInst, VPValue *Op0,
2524-
VPValue *Op1, VPValue *Cond, unsigned VFScaleFactor)
2525-
: VPPartialReductionRecipe(ReductionInst->getOpcode(), Op0, Op1, Cond,
2526-
VFScaleFactor, ReductionInst) {}
2527-
VPPartialReductionRecipe(unsigned Opcode, VPValue *Op0, VPValue *Op1,
2528-
VPValue *Cond, unsigned ScaleFactor,
2529-
Instruction *ReductionInst = nullptr)
2530-
: VPReductionRecipe(VPDef::VPPartialReductionSC, RecurKind::Add,
2531-
FastMathFlags(), ReductionInst,
2532-
ArrayRef<VPValue *>({Op0, Op1}), Cond, false, {}),
2533-
Opcode(Opcode), VFScaleFactor(ScaleFactor) {
2534-
[[maybe_unused]] auto *AccumulatorRecipe =
2535-
getChainOp()->getDefiningRecipe();
2536-
// When cloning as part of a VPExpressionRecipe, the chain op could have
2537-
// been removed from the plan and so doesn't have a defining recipe.
2538-
assert((!AccumulatorRecipe ||
2539-
isa<VPReductionPHIRecipe>(AccumulatorRecipe) ||
2540-
isa<VPPartialReductionRecipe>(AccumulatorRecipe)) &&
2541-
"Unexpected operand order for partial reduction recipe");
2542-
}
2543-
~VPPartialReductionRecipe() override = default;
2544-
2545-
VPPartialReductionRecipe *clone() override {
2546-
return new VPPartialReductionRecipe(Opcode, getOperand(0), getOperand(1),
2547-
getCondOp(), VFScaleFactor,
2548-
getUnderlyingInstr());
2549-
}
2550-
2551-
VP_CLASSOF_IMPL(VPDef::VPPartialReductionSC)
2552-
2553-
/// Generate the reduction in the loop.
2554-
void execute(VPTransformState &State) override;
2555-
2556-
/// Return the cost of this VPPartialReductionRecipe.
2557-
InstructionCost computeCost(ElementCount VF,
2558-
VPCostContext &Ctx) const override;
2559-
2560-
/// Get the binary op's opcode.
2561-
unsigned getOpcode() const { return Opcode; }
2562-
25632524
/// Get the factor that the VF of this recipe's output should be scaled by.
25642525
unsigned getVFScaleFactor() const { return VFScaleFactor; }
2565-
2566-
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2567-
/// Print the recipe.
2568-
void print(raw_ostream &O, const Twine &Indent,
2569-
VPSlotTracker &SlotTracker) const override;
2570-
#endif
25712526
};
25722527

25732528
/// A recipe to represent inloop reduction operations with vector-predication
@@ -2583,7 +2538,7 @@ class VPReductionEVLRecipe : public VPReductionRecipe {
25832538
R.getFastMathFlags(),
25842539
cast_or_null<Instruction>(R.getUnderlyingValue()),
25852540
ArrayRef<VPValue *>({R.getChainOp(), R.getVecOp(), &EVL}), CondOp,
2586-
R.isOrdered(), DL) {}
2541+
R.isOrdered(), 0, DL) {}
25872542

25882543
~VPReductionEVLRecipe() override = default;
25892544

llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -282,10 +282,10 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
282282
[](const auto *R) { return R->getScalarType(); })
283283
.Case<VPReductionRecipe, VPPredInstPHIRecipe, VPWidenPHIRecipe,
284284
VPScalarIVStepsRecipe, VPWidenGEPRecipe, VPVectorPointerRecipe,
285-
VPVectorEndPointerRecipe, VPWidenCanonicalIVRecipe,
286-
VPPartialReductionRecipe>([this](const VPRecipeBase *R) {
287-
return inferScalarType(R->getOperand(0));
288-
})
285+
VPVectorEndPointerRecipe, VPWidenCanonicalIVRecipe>(
286+
[this](const VPRecipeBase *R) {
287+
return inferScalarType(R->getOperand(0));
288+
})
289289
// VPInstructionWithType must be handled before VPInstruction.
290290
.Case<VPInstructionWithType, VPWidenIntrinsicRecipe,
291291
VPWidenCastRecipe>(
@@ -396,7 +396,7 @@ bool VPDominatorTree::properlyDominates(const VPRecipeBase *A,
396396
static unsigned getVFScaleFactor(VPRecipeBase *R) {
397397
if (auto *RR = dyn_cast<VPReductionPHIRecipe>(R))
398398
return RR->getVFScaleFactor();
399-
if (auto *RR = dyn_cast<VPPartialReductionRecipe>(R))
399+
if (auto *RR = dyn_cast<VPReductionRecipe>(R))
400400
return RR->getVFScaleFactor();
401401
assert(
402402
(!isa<VPInstruction>(R) || cast<VPInstruction>(R)->getOpcode() !=
@@ -566,8 +566,9 @@ SmallVector<VPRegisterUsage, 8> llvm::calculateRegisterUsageForPlan(
566566
} else {
567567
// The output from scaled phis and scaled reductions actually has
568568
// fewer lanes than the VF.
569-
unsigned ScaleFactor = getVFScaleFactor(R);
570-
ElementCount VF = VFs[J].divideCoefficientBy(ScaleFactor);
569+
ElementCount VF = VFs[J];
570+
if (unsigned ScaleFactor = getVFScaleFactor(R))
571+
VF = VF.divideCoefficientBy(ScaleFactor);
571572
LLVM_DEBUG(if (VF != VFs[J]) {
572573
dbgs() << "LV(REG): Scaled down VF from " << VFs[J] << " to " << VF
573574
<< " for " << *R << "\n";

0 commit comments

Comments
 (0)