-
Notifications
You must be signed in to change notification settings - Fork 14.5k
[LoopVectorize] Generate wide active lane masks #147535
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -356,6 +356,10 @@ cl::opt<bool> llvm::EnableVPlanNativePath( | |
cl::desc("Enable VPlan-native vectorization path with " | ||
"support for outer loop vectorization.")); | ||
|
||
cl::opt<bool> llvm::EnableWideActiveLaneMask( | ||
"enable-wide-lane-mask", cl::init(false), cl::Hidden, | ||
cl::desc("Enable use of wide get active lane mask instructions")); | ||
|
||
cl::opt<bool> | ||
llvm::VerifyEachVPlan("vplan-verify-each", | ||
#ifdef EXPENSIVE_CHECKS | ||
|
@@ -7328,7 +7332,10 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( | |
VPlanTransforms::runPass(VPlanTransforms::addBranchWeightToMiddleTerminator, | ||
BestVPlan, BestVF, VScale); | ||
} | ||
VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE); | ||
VPlanTransforms::optimizeForVFAndUF( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Perhaps instead of passing in
What do you think? |
||
BestVPlan, BestVF, BestUF, PSE, | ||
ILV.Cost->getTailFoldingStyle() == | ||
TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck); | ||
VPlanTransforms::simplifyRecipes(BestVPlan, *Legal->getWidestInductionType()); | ||
VPlanTransforms::narrowInterleaveGroups( | ||
BestVPlan, BestVF, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -954,6 +954,9 @@ class VPInstruction : public VPRecipeWithIRFlags, | |
// part if it is scalar. In the latter case, the recipe will be removed | ||
// during unrolling. | ||
ExtractPenultimateElement, | ||
// Extracts a subvector from a vector (first operand) starting at a given | ||
// offset (second operand). | ||
ExtractSubvector, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It would be good to add this to VPInstruction::computeCost to make sure the cost is properly represented in the VPlan, although I know that currently the only use case is during plan execution, after the cost model. |
||
LogicalAnd, // Non-poison propagating logical And. | ||
// Add an offset in bytes (second operand) to a base pointer (first | ||
// operand). Only generates scalar values (either for the first lane only or | ||
|
@@ -1887,6 +1890,9 @@ class VPHeaderPHIRecipe : public VPSingleDefRecipe, public VPPhiAccessors { | |
return getOperand(1); | ||
} | ||
|
||
// Update the incoming value from the loop backedge. | ||
void setBackedgeValue(VPValue *V) { setOperand(1, V); } | ||
|
||
/// Returns the backedge value as a recipe. The backedge value is guaranteed | ||
/// to be a recipe. | ||
virtual VPRecipeBase &getBackedgeRecipe() { | ||
|
@@ -3234,10 +3240,12 @@ class VPCanonicalIVPHIRecipe : public VPHeaderPHIRecipe { | |
/// TODO: It would be good to use the existing VPWidenPHIRecipe instead and | ||
/// remove VPActiveLaneMaskPHIRecipe. | ||
class VPActiveLaneMaskPHIRecipe : public VPHeaderPHIRecipe { | ||
unsigned UnrollPart = 0; | ||
|
||
public: | ||
VPActiveLaneMaskPHIRecipe(VPValue *StartMask, DebugLoc DL) | ||
: VPHeaderPHIRecipe(VPDef::VPActiveLaneMaskPHISC, nullptr, StartMask, | ||
DL) {} | ||
VPActiveLaneMaskPHIRecipe(VPValue *StartMask, DebugLoc DL, unsigned Part = 0) | ||
: VPHeaderPHIRecipe(VPDef::VPActiveLaneMaskPHISC, nullptr, StartMask, DL), | ||
UnrollPart(Part) {} | ||
|
||
~VPActiveLaneMaskPHIRecipe() override = default; | ||
|
||
|
@@ -3250,6 +3258,9 @@ class VPActiveLaneMaskPHIRecipe : public VPHeaderPHIRecipe { | |
|
||
VP_CLASSOF_IMPL(VPDef::VPActiveLaneMaskPHISC) | ||
|
||
unsigned getUnrollPart() { return UnrollPart; } | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Would be good to add the |
||
void setUnrollPart(unsigned Part) { UnrollPart = Part; } | ||
|
||
/// Generate the active lane mask phi of the vector loop. | ||
void execute(VPTransformState &State) override; | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -469,15 +469,16 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) { | |
case Instruction::ICmp: | ||
case Instruction::FCmp: | ||
case Instruction::Store: | ||
case VPInstruction::ActiveLaneMask: | ||
case VPInstruction::BranchOnCount: | ||
case VPInstruction::ComputeReductionResult: | ||
case VPInstruction::ExtractSubvector: | ||
case VPInstruction::FirstOrderRecurrenceSplice: | ||
case VPInstruction::LogicalAnd: | ||
case VPInstruction::PtrAdd: | ||
case VPInstruction::WideIVStep: | ||
return 2; | ||
case Instruction::Select: | ||
case VPInstruction::ActiveLaneMask: | ||
case VPInstruction::ComputeAnyOfResult: | ||
case VPInstruction::ReductionStartVector: | ||
return 3; | ||
|
@@ -614,7 +615,9 @@ Value *VPInstruction::generate(VPTransformState &State) { | |
Name); | ||
|
||
auto *Int1Ty = Type::getInt1Ty(Builder.getContext()); | ||
auto *PredTy = VectorType::get(Int1Ty, State.VF); | ||
auto PredTy = VectorType::get( | ||
Int1Ty, State.VF * cast<ConstantInt>(getOperand(2)->getLiveInIRValue()) | ||
->getZExtValue()); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think that given we're now potentially generating a different mask we should update the cost for VPInstruction::ActiveLaneMask in VPInstruction::computeCost if using a wider mask. Again, it's not going to make much difference because the wider mask is generated after the cost model anyway, but good to have it for completeness. |
||
return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, | ||
{PredTy, ScalarTC->getType()}, | ||
{VIVElem0, ScalarTC}, nullptr, Name); | ||
|
@@ -846,6 +849,14 @@ Value *VPInstruction::generate(VPTransformState &State) { | |
Res->setName(Name); | ||
return Res; | ||
} | ||
case VPInstruction::ExtractSubvector: { | ||
Value *Vec = State.get(getOperand(0)); | ||
assert(State.VF.isVector()); | ||
auto Idx = cast<ConstantInt>(getOperand(1)->getLiveInIRValue()); | ||
auto ResTy = VectorType::get( | ||
State.TypeAnalysis.inferScalarType(getOperand(0)), State.VF); | ||
return Builder.CreateExtractVector(ResTy, Vec, Idx); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If this maps 1-1 to an intrinsic, can we just use VPWidenIntrinsic instead? |
||
} | ||
case VPInstruction::LogicalAnd: { | ||
Value *A = State.get(getOperand(0)); | ||
Value *B = State.get(getOperand(1)); | ||
|
@@ -1044,6 +1055,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const { | |
case VPInstruction::CanonicalIVIncrementForPart: | ||
case VPInstruction::ExtractLastElement: | ||
case VPInstruction::ExtractPenultimateElement: | ||
case VPInstruction::ExtractSubvector: | ||
case VPInstruction::FirstActiveLane: | ||
case VPInstruction::FirstOrderRecurrenceSplice: | ||
case VPInstruction::LogicalAnd: | ||
|
@@ -1174,6 +1186,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, | |
case VPInstruction::ExtractPenultimateElement: | ||
O << "extract-penultimate-element"; | ||
break; | ||
case VPInstruction::ExtractSubvector: | ||
O << "extract-subvector"; | ||
break; | ||
case VPInstruction::ComputeAnyOfResult: | ||
O << "compute-anyof-result"; | ||
break; | ||
|
Original file line number | Diff line number | Diff line change | ||
---|---|---|---|---|
|
@@ -12,6 +12,7 @@ | |||
//===----------------------------------------------------------------------===// | ||||
|
||||
#include "VPlanTransforms.h" | ||||
#include "LoopVectorizationPlanner.h" | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is just to get access to |
||||
#include "VPRecipeBuilder.h" | ||||
#include "VPlan.h" | ||||
#include "VPlanAnalysis.h" | ||||
|
@@ -1432,20 +1433,93 @@ static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan, | |||
return SE.isKnownPredicate(CmpInst::ICMP_EQ, TripCount, C); | ||||
} | ||||
|
||||
static void extractFromWideActiveLaneMask(VPlan &Plan, ElementCount VF, | ||||
unsigned UF) { | ||||
VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion(); | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. IIUC this needs to be cost-driven, to only be done when the wider active-lane-mask is profitable? |
||||
auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry()); | ||||
VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock(); | ||||
auto *Term = &ExitingVPBB->back(); | ||||
|
||||
VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV(); | ||||
LLVMContext &Ctx = CanonicalIV->getScalarType()->getContext(); | ||||
using namespace llvm::VPlanPatternMatch; | ||||
|
||||
auto extractFromALM = [&](VPInstruction *ALM, VPInstruction *InsBefore, | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think you can avoid passing in
|
||||
SmallVectorImpl<VPValue *> &Extracts) { | ||||
VPBuilder Builder(InsBefore); | ||||
DebugLoc DL = ALM->getDebugLoc(); | ||||
for (unsigned Part = 0; Part < UF; ++Part) { | ||||
SmallVector<VPValue *> Ops; | ||||
Ops.append({ALM, Plan.getOrAddLiveIn( | ||||
ConstantInt::get(IntegerType::getInt64Ty(Ctx), | ||||
VF.getKnownMinValue() * Part))}); | ||||
Extracts.push_back( | ||||
Builder.createNaryOp(VPInstruction::ExtractSubvector, Ops, DL)); | ||||
} | ||||
}; | ||||
|
||||
// Create a list of each active lane mask phi, ordered by unroll part. | ||||
SmallVector<VPActiveLaneMaskPHIRecipe *> Phis(UF, nullptr); | ||||
for (VPRecipeBase &R : Header->phis()) | ||||
if (auto *Phi = dyn_cast<VPActiveLaneMaskPHIRecipe>(&R)) | ||||
Phis[Phi->getUnrollPart()] = Phi; | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we need the part here? Or can we order the active-lane-mask phis by their backedge values? |
||||
|
||||
assert(all_of(Phis, [](VPActiveLaneMaskPHIRecipe *Phi) { return Phi; }) && | ||||
"Expected one VPActiveLaneMaskPHIRecipe for each unroll part"); | ||||
|
||||
// When using wide lane masks, the return type of the get.active.lane.mask | ||||
// intrinsic is VF x UF (second operand). | ||||
VPValue *ALMMultiplier = | ||||
Plan.getOrAddLiveIn(ConstantInt::get(IntegerType::getInt64Ty(Ctx), UF)); | ||||
cast<VPInstruction>(Phis[0]->getStartValue())->setOperand(2, ALMMultiplier); | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is it worth asserting that the start value and backedge values are ActiveLaneMask instructions? |
||||
cast<VPInstruction>(Phis[0]->getBackedgeValue()) | ||||
->setOperand(2, ALMMultiplier); | ||||
|
||||
// Create UF x extract vectors and insert into preheader. | ||||
SmallVector<VPValue *> EntryExtracts; | ||||
auto *EntryALM = cast<VPInstruction>(Phis[0]->getStartValue()); | ||||
extractFromALM(EntryALM, cast<VPInstruction>(&EntryALM->getParent()->back()), | ||||
EntryExtracts); | ||||
|
||||
// Create UF x extract vectors and insert before the loop compare & branch, | ||||
// updating the compare to use the first extract. | ||||
SmallVector<VPValue *> LoopExtracts; | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Given you know exactly how many extracts you need I think you can do
|
||||
auto *LoopALM = cast<VPInstruction>(Phis[0]->getBackedgeValue()); | ||||
VPInstruction *Not = cast<VPInstruction>(Term->getOperand(0)); | ||||
extractFromALM(LoopALM, Not, LoopExtracts); | ||||
Not->setOperand(0, LoopExtracts[0]); | ||||
|
||||
// Update the incoming values of active lane mask phis. | ||||
for (unsigned Part = 0; Part < UF; ++Part) { | ||||
Phis[Part]->setStartValue(EntryExtracts[Part]); | ||||
Phis[Part]->setBackedgeValue(LoopExtracts[Part]); | ||||
} | ||||
|
||||
return; | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||
} | ||||
|
||||
/// Try to simplify the branch condition of \p Plan. This may restrict the | ||||
/// resulting plan to \p BestVF and \p BestUF. | ||||
static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, | ||||
unsigned BestUF, | ||||
PredicatedScalarEvolution &PSE) { | ||||
static bool | ||||
simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, | ||||
unsigned BestUF, | ||||
PredicatedScalarEvolution &PSE, | ||||
bool DataAndControlFlowWithoutRuntimeCheck) { | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why do we need to check |
||||
VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion(); | ||||
VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock(); | ||||
auto *Term = &ExitingVPBB->back(); | ||||
VPValue *Cond; | ||||
ScalarEvolution &SE = *PSE.getSE(); | ||||
using namespace llvm::VPlanPatternMatch; | ||||
if (match(Term, m_BranchOnCount(m_VPValue(), m_VPValue())) || | ||||
match(Term, m_BranchOnCond( | ||||
m_Not(m_ActiveLaneMask(m_VPValue(), m_VPValue()))))) { | ||||
auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry()); | ||||
bool BranchALM = match(Term, m_BranchOnCond(m_Not(m_ActiveLaneMask( | ||||
m_VPValue(), m_VPValue(), m_VPValue())))); | ||||
|
||||
if (BranchALM || match(Term, m_BranchOnCount(m_VPValue(), m_VPValue()))) { | ||||
if (BranchALM && DataAndControlFlowWithoutRuntimeCheck && | ||||
EnableWideActiveLaneMask && BestVF.isVector() && BestUF > 1) | ||||
extractFromWideActiveLaneMask(Plan, BestVF, BestUF); | ||||
Comment on lines
+1514
to
+1521
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is there any benefit from having this here? It doesn't seem to fit here, as it does not simplify the branch condition directly? |
||||
|
||||
// Try to simplify the branch condition if TC <= VF * UF when the latch | ||||
// terminator is BranchOnCount or BranchOnCond where the input is | ||||
// Not(ActiveLaneMask). | ||||
|
@@ -1470,7 +1544,6 @@ static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, | |||
// The vector loop region only executes once. If possible, completely remove | ||||
// the region, otherwise replace the terminator controlling the latch with | ||||
// (BranchOnCond true). | ||||
auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry()); | ||||
auto *CanIVTy = Plan.getCanonicalIV()->getScalarType(); | ||||
if (all_of( | ||||
Header->phis(), | ||||
|
@@ -1507,14 +1580,15 @@ static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, | |||
return true; | ||||
} | ||||
|
||||
void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, | ||||
unsigned BestUF, | ||||
PredicatedScalarEvolution &PSE) { | ||||
void VPlanTransforms::optimizeForVFAndUF( | ||||
VPlan &Plan, ElementCount BestVF, unsigned BestUF, | ||||
PredicatedScalarEvolution &PSE, | ||||
bool DataAndControlFlowWithoutRuntimeCheck) { | ||||
assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan"); | ||||
assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan"); | ||||
|
||||
bool MadeChange = | ||||
simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE); | ||||
bool MadeChange = simplifyBranchConditionForVFAndUF( | ||||
Plan, BestVF, BestUF, PSE, DataAndControlFlowWithoutRuntimeCheck); | ||||
MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF); | ||||
|
||||
if (MadeChange) { | ||||
|
@@ -2006,9 +2080,11 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch( | |||
"index.part.next"); | ||||
|
||||
// Create the active lane mask instruction in the VPlan preheader. | ||||
auto *EntryALM = | ||||
Builder.createNaryOp(VPInstruction::ActiveLaneMask, {EntryIncrement, TC}, | ||||
DL, "active.lane.mask.entry"); | ||||
VPValue *ALMMultiplier = Plan.getOrAddLiveIn( | ||||
ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1)); | ||||
auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask, | ||||
{EntryIncrement, TC, ALMMultiplier}, DL, | ||||
"active.lane.mask.entry"); | ||||
|
||||
// Now create the ActiveLaneMaskPhi recipe in the main loop using the | ||||
// preheader ActiveLaneMask instruction. | ||||
|
@@ -2023,8 +2099,8 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch( | |||
Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart, | ||||
{IncrementValue}, {false, false}, DL); | ||||
auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask, | ||||
{InLoopIncrement, TripCount}, DL, | ||||
"active.lane.mask.next"); | ||||
{InLoopIncrement, TripCount, ALMMultiplier}, | ||||
DL, "active.lane.mask.next"); | ||||
LaneMaskPhi->addOperand(ALM); | ||||
|
||||
// Replace the original terminator with BranchOnCond. We have to invert the | ||||
|
@@ -2101,9 +2177,12 @@ void VPlanTransforms::addActiveLaneMask( | |||
Plan, DataAndControlFlowWithoutRuntimeCheck); | ||||
} else { | ||||
VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV); | ||||
LaneMask = B.createNaryOp(VPInstruction::ActiveLaneMask, | ||||
{WideCanonicalIV, Plan.getTripCount()}, nullptr, | ||||
"active.lane.mask"); | ||||
VPValue *ALMMultiplier = Plan.getOrAddLiveIn( | ||||
ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1)); | ||||
LaneMask = | ||||
B.createNaryOp(VPInstruction::ActiveLaneMask, | ||||
{WideCanonicalIV, Plan.getTripCount(), ALMMultiplier}, | ||||
nullptr, "active.lane.mask"); | ||||
} | ||||
|
||||
// Walk users of WideCanonicalIV and replace all compares of the form | ||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is useful for testing the new feature, although I guess that to enable this by default in the future you'll either need a new TTI hook to query the target's preference, or you'll need to compare the cost of using a wider mask plus UF extracts against the cost of using UF normal masks and pick whichever is cheapest.