Skip to content

[LoopVectorize] Generate wide active lane masks #147535

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions llvm/lib/Analysis/VectorUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,7 @@ bool llvm::isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID,
case Intrinsic::is_fpclass:
case Intrinsic::vp_is_fpclass:
case Intrinsic::powi:
case Intrinsic::vector_extract:
return (ScalarOpdIdx == 1);
case Intrinsic::smul_fix:
case Intrinsic::smul_fix_sat:
Expand Down Expand Up @@ -195,6 +196,7 @@ bool llvm::isVectorIntrinsicWithOverloadTypeAtArg(
case Intrinsic::vp_llrint:
case Intrinsic::ucmp:
case Intrinsic::scmp:
case Intrinsic::vector_extract:
return OpdIdx == -1 || OpdIdx == 0;
case Intrinsic::modf:
case Intrinsic::sincos:
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ class VPRecipeBuilder;
struct VFRange;

extern cl::opt<bool> EnableVPlanNativePath;
extern cl::opt<bool> EnableWideActiveLaneMask;
extern cl::opt<unsigned> ForceTargetInstructionCost;

/// VPlan-based builder utility analogous to IRBuilder.
Expand Down
9 changes: 8 additions & 1 deletion llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -356,6 +356,10 @@ cl::opt<bool> llvm::EnableVPlanNativePath(
cl::desc("Enable VPlan-native vectorization path with "
"support for outer loop vectorization."));

// Hidden command-line option "enable-wide-lane-mask"; off by default.
// simplifyBranchConditionForVFAndUF checks this flag before calling
// extractFromWideActiveLaneMask.
cl::opt<bool> llvm::EnableWideActiveLaneMask(
"enable-wide-lane-mask", cl::init(false), cl::Hidden,
cl::desc("Enable use of wide get active lane mask instructions"));

cl::opt<bool>
llvm::VerifyEachVPlan("vplan-verify-each",
#ifdef EXPENSIVE_CHECKS
Expand Down Expand Up @@ -7328,7 +7332,10 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
VPlanTransforms::runPass(VPlanTransforms::addBranchWeightToMiddleTerminator,
BestVPlan, BestVF, VScale);
}
VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
VPlanTransforms::optimizeForVFAndUF(
BestVPlan, BestVF, BestUF, PSE,
ILV.Cost->getTailFoldingStyle() ==
TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck);
VPlanTransforms::simplifyRecipes(BestVPlan, *Legal->getWidestInductionType());
VPlanTransforms::narrowInterleaveGroups(
BestVPlan, BestVF,
Expand Down
17 changes: 14 additions & 3 deletions llvm/lib/Transforms/Vectorize/VPlan.h
Original file line number Diff line number Diff line change
Expand Up @@ -954,6 +954,9 @@ class VPInstruction : public VPRecipeWithIRFlags,
// part if it is scalar. In the latter case, the recipe will be removed
// during unrolling.
ExtractPenultimateElement,
// Extracts a subvector from a vector (first operand) starting at a given
// offset (second operand).
ExtractSubvector,
LogicalAnd, // Non-poison propagating logical And.
// Add an offset in bytes (second operand) to a base pointer (first
// operand). Only generates scalar values (either for the first lane only or
Expand Down Expand Up @@ -1887,6 +1890,9 @@ class VPHeaderPHIRecipe : public VPSingleDefRecipe, public VPPhiAccessors {
return getOperand(1);
}

/// Update the incoming value from the loop backedge. The backedge value is
/// stored as operand 1 of this recipe, so \p V replaces that operand.
void setBackedgeValue(VPValue *V) { setOperand(1, V); }

/// Returns the backedge value as a recipe. The backedge value is guaranteed
/// to be a recipe.
virtual VPRecipeBase &getBackedgeRecipe() {
Expand Down Expand Up @@ -3234,10 +3240,12 @@ class VPCanonicalIVPHIRecipe : public VPHeaderPHIRecipe {
/// TODO: It would be good to use the existing VPWidenPHIRecipe instead and
/// remove VPActiveLaneMaskPHIRecipe.
class VPActiveLaneMaskPHIRecipe : public VPHeaderPHIRecipe {
unsigned UnrollPart = 0;

public:
VPActiveLaneMaskPHIRecipe(VPValue *StartMask, DebugLoc DL)
: VPHeaderPHIRecipe(VPDef::VPActiveLaneMaskPHISC, nullptr, StartMask,
DL) {}
VPActiveLaneMaskPHIRecipe(VPValue *StartMask, DebugLoc DL, unsigned Part = 0)
: VPHeaderPHIRecipe(VPDef::VPActiveLaneMaskPHISC, nullptr, StartMask, DL),
UnrollPart(Part) {}

~VPActiveLaneMaskPHIRecipe() override = default;

Expand All @@ -3250,6 +3258,9 @@ class VPActiveLaneMaskPHIRecipe : public VPHeaderPHIRecipe {

VP_CLASSOF_IMPL(VPDef::VPActiveLaneMaskPHISC)

unsigned getUnrollPart() { return UnrollPart; }
void setUnrollPart(unsigned Part) { UnrollPart = Part; }

/// Generate the active lane mask phi of the vector loop.
void execute(VPTransformState &State) override;

Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
switch (Opcode) {
case Instruction::ExtractElement:
case Instruction::Freeze:
case VPInstruction::ExtractSubvector:
case VPInstruction::ReductionStartVector:
return inferScalarType(R->getOperand(0));
case Instruction::Select: {
Expand Down
9 changes: 5 additions & 4 deletions llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
Original file line number Diff line number Diff line change
Expand Up @@ -384,10 +384,11 @@ m_Broadcast(const Op0_t &Op0) {
return m_VPInstruction<VPInstruction::Broadcast>(Op0);
}

template <typename Op0_t, typename Op1_t>
inline BinaryVPInstruction_match<Op0_t, Op1_t, VPInstruction::ActiveLaneMask>
m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1) {
return m_VPInstruction<VPInstruction::ActiveLaneMask>(Op0, Op1);
/// Match a VPInstruction with opcode VPInstruction::ActiveLaneMask whose three
/// operands are matched, in order, by \p Op0, \p Op1 and \p Op2.
template <typename Op0_t, typename Op1_t, typename Op2_t>
inline TernaryVPInstruction_match<Op0_t, Op1_t, Op2_t,
VPInstruction::ActiveLaneMask>
m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2) {
return m_VPInstruction<VPInstruction::ActiveLaneMask>(Op0, Op1, Op2);
}

template <typename Op0_t, typename Op1_t>
Expand Down
19 changes: 17 additions & 2 deletions llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -469,15 +469,16 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
case Instruction::ICmp:
case Instruction::FCmp:
case Instruction::Store:
case VPInstruction::ActiveLaneMask:
case VPInstruction::BranchOnCount:
case VPInstruction::ComputeReductionResult:
case VPInstruction::ExtractSubvector:
case VPInstruction::FirstOrderRecurrenceSplice:
case VPInstruction::LogicalAnd:
case VPInstruction::PtrAdd:
case VPInstruction::WideIVStep:
return 2;
case Instruction::Select:
case VPInstruction::ActiveLaneMask:
case VPInstruction::ComputeAnyOfResult:
case VPInstruction::ReductionStartVector:
return 3;
Expand Down Expand Up @@ -614,7 +615,9 @@ Value *VPInstruction::generate(VPTransformState &State) {
Name);

auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
auto *PredTy = VectorType::get(Int1Ty, State.VF);
auto PredTy = VectorType::get(
Int1Ty, State.VF * cast<ConstantInt>(getOperand(2)->getLiveInIRValue())
->getZExtValue());
return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
{PredTy, ScalarTC->getType()},
{VIVElem0, ScalarTC}, nullptr, Name);
Expand Down Expand Up @@ -846,6 +849,14 @@ Value *VPInstruction::generate(VPTransformState &State) {
Res->setName(Name);
return Res;
}
case VPInstruction::ExtractSubvector: {
Value *Vec = State.get(getOperand(0));
assert(State.VF.isVector());
auto Idx = cast<ConstantInt>(getOperand(1)->getLiveInIRValue());
auto ResTy = VectorType::get(
State.TypeAnalysis.inferScalarType(getOperand(0)), State.VF);
return Builder.CreateExtractVector(ResTy, Vec, Idx);
}
case VPInstruction::LogicalAnd: {
Value *A = State.get(getOperand(0));
Value *B = State.get(getOperand(1));
Expand Down Expand Up @@ -1044,6 +1055,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
case VPInstruction::CanonicalIVIncrementForPart:
case VPInstruction::ExtractLastElement:
case VPInstruction::ExtractPenultimateElement:
case VPInstruction::ExtractSubvector:
case VPInstruction::FirstActiveLane:
case VPInstruction::FirstOrderRecurrenceSplice:
case VPInstruction::LogicalAnd:
Expand Down Expand Up @@ -1174,6 +1186,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::ExtractPenultimateElement:
O << "extract-penultimate-element";
break;
case VPInstruction::ExtractSubvector:
O << "extract-subvector";
break;
case VPInstruction::ComputeAnyOfResult:
O << "compute-anyof-result";
break;
Expand Down
119 changes: 99 additions & 20 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//

#include "VPlanTransforms.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanAnalysis.h"
Expand Down Expand Up @@ -1432,20 +1433,93 @@ static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan,
return SE.isKnownPredicate(CmpInst::ICMP_EQ, TripCount, C);
}

/// Feed every active-lane-mask phi of the vector loop in \p Plan from a single
/// wide lane mask instead of one mask per unroll part.
///
/// The ActiveLaneMask instructions that provide the start and backedge value
/// of the part-0 phi get \p UF as their operand at index 2. For each of the
/// two masks, \p UF ExtractSubvector instructions are created at offsets
/// VF.getKnownMinValue() * Part, and the phi of each part is rewired to use
/// the extract for its part as start and backedge value. The Not feeding the
/// latch terminator is updated to use the part-0 extract of the in-loop mask.
///
/// Expects exactly one VPActiveLaneMaskPHIRecipe per unroll part in the
/// header of the vector loop region.
static void extractFromWideActiveLaneMask(VPlan &Plan, ElementCount VF,
                                          unsigned UF) {
  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
  auto *HeaderVPBB = cast<VPBasicBlock>(LoopRegion->getEntry());
  auto *Term = &LoopRegion->getExitingBasicBlock()->back();

  LLVMContext &Ctx = Plan.getCanonicalIV()->getScalarType()->getContext();
  auto *I64Ty = IntegerType::getInt64Ty(Ctx);

  // Emits one ExtractSubvector of WideMask per unroll part in front of
  // InsertPt and appends the results to Parts in part order.
  auto CreatePartExtracts = [&](VPInstruction *WideMask,
                                VPInstruction *InsertPt,
                                SmallVectorImpl<VPValue *> &Parts) {
    VPBuilder Builder(InsertPt);
    DebugLoc DL = WideMask->getDebugLoc();
    for (unsigned Part = 0; Part != UF; ++Part) {
      VPValue *Offset = Plan.getOrAddLiveIn(
          ConstantInt::get(I64Ty, VF.getKnownMinValue() * Part));
      SmallVector<VPValue *> Ops = {WideMask, Offset};
      Parts.push_back(
          Builder.createNaryOp(VPInstruction::ExtractSubvector, Ops, DL));
    }
  };

  // Collect the active lane mask phis, indexed by their unroll part.
  SmallVector<VPActiveLaneMaskPHIRecipe *> MaskPhis(UF, nullptr);
  for (VPRecipeBase &R : HeaderVPBB->phis()) {
    auto *MaskPhi = dyn_cast<VPActiveLaneMaskPHIRecipe>(&R);
    if (!MaskPhi)
      continue;
    MaskPhis[MaskPhi->getUnrollPart()] = MaskPhi;
  }

  assert(all_of(MaskPhis,
                [](VPActiveLaneMaskPHIRecipe *Phi) { return Phi; }) &&
         "Expected one VPActiveLaneMaskPHIRecipe for each unroll part");

  // Widen both masks of the part-0 phi: operand index 2 of ActiveLaneMask is
  // the multiplier applied to VF for the mask's result type, so set it to UF.
  VPActiveLaneMaskPHIRecipe *FirstPhi = MaskPhis[0];
  VPValue *ALMMultiplier = Plan.getOrAddLiveIn(ConstantInt::get(I64Ty, UF));
  auto *EntryALM = cast<VPInstruction>(FirstPhi->getStartValue());
  EntryALM->setOperand(2, ALMMultiplier);
  auto *LoopALM = cast<VPInstruction>(FirstPhi->getBackedgeValue());
  LoopALM->setOperand(2, ALMMultiplier);

  // Per-part extracts of the entry mask, placed in front of the last recipe of
  // the block that holds the entry mask.
  SmallVector<VPValue *> EntryExtracts;
  auto *EntryInsertPt = cast<VPInstruction>(&EntryALM->getParent()->back());
  CreatePartExtracts(EntryALM, EntryInsertPt, EntryExtracts);

  // Per-part extracts of the in-loop mask, placed in front of the Not that
  // feeds the latch terminator; that Not then tests the part-0 extract.
  SmallVector<VPValue *> LoopExtracts;
  auto *NotMask = cast<VPInstruction>(Term->getOperand(0));
  CreatePartExtracts(LoopALM, NotMask, LoopExtracts);
  NotMask->setOperand(0, LoopExtracts[0]);

  // Rewire each phi to the extracts that belong to its unroll part.
  for (unsigned Part = 0; Part != UF; ++Part) {
    VPActiveLaneMaskPHIRecipe *MaskPhi = MaskPhis[Part];
    MaskPhi->setStartValue(EntryExtracts[Part]);
    MaskPhi->setBackedgeValue(LoopExtracts[Part]);
  }
}

/// Try to simplify the branch condition of \p Plan. This may restrict the
/// resulting plan to \p BestVF and \p BestUF.
static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
unsigned BestUF,
PredicatedScalarEvolution &PSE) {
static bool
simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
unsigned BestUF,
PredicatedScalarEvolution &PSE,
bool DataAndControlFlowWithoutRuntimeCheck) {
VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
auto *Term = &ExitingVPBB->back();
VPValue *Cond;
ScalarEvolution &SE = *PSE.getSE();
using namespace llvm::VPlanPatternMatch;
if (match(Term, m_BranchOnCount(m_VPValue(), m_VPValue())) ||
match(Term, m_BranchOnCond(
m_Not(m_ActiveLaneMask(m_VPValue(), m_VPValue()))))) {
auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
bool BranchALM = match(Term, m_BranchOnCond(m_Not(m_ActiveLaneMask(
m_VPValue(), m_VPValue(), m_VPValue()))));

if (BranchALM || match(Term, m_BranchOnCount(m_VPValue(), m_VPValue()))) {
if (BranchALM && DataAndControlFlowWithoutRuntimeCheck &&
EnableWideActiveLaneMask && BestVF.isVector() && BestUF > 1)
extractFromWideActiveLaneMask(Plan, BestVF, BestUF);

// Try to simplify the branch condition if TC <= VF * UF when the latch
// terminator is BranchOnCount or BranchOnCond where the input is
// Not(ActiveLaneMask).
Expand All @@ -1470,7 +1544,6 @@ static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
// The vector loop region only executes once. If possible, completely remove
// the region, otherwise replace the terminator controlling the latch with
// (BranchOnCond true).
auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
auto *CanIVTy = Plan.getCanonicalIV()->getScalarType();
if (all_of(
Header->phis(),
Expand Down Expand Up @@ -1507,14 +1580,15 @@ static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
return true;
}

void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
unsigned BestUF,
PredicatedScalarEvolution &PSE) {
void VPlanTransforms::optimizeForVFAndUF(
VPlan &Plan, ElementCount BestVF, unsigned BestUF,
PredicatedScalarEvolution &PSE,
bool DataAndControlFlowWithoutRuntimeCheck) {
assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");

bool MadeChange =
simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
bool MadeChange = simplifyBranchConditionForVFAndUF(
Plan, BestVF, BestUF, PSE, DataAndControlFlowWithoutRuntimeCheck);
MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);

if (MadeChange) {
Expand Down Expand Up @@ -2006,9 +2080,11 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
"index.part.next");

// Create the active lane mask instruction in the VPlan preheader.
auto *EntryALM =
Builder.createNaryOp(VPInstruction::ActiveLaneMask, {EntryIncrement, TC},
DL, "active.lane.mask.entry");
VPValue *ALMMultiplier = Plan.getOrAddLiveIn(
ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1));
auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
{EntryIncrement, TC, ALMMultiplier}, DL,
"active.lane.mask.entry");

// Now create the ActiveLaneMaskPhi recipe in the main loop using the
// preheader ActiveLaneMask instruction.
Expand All @@ -2023,8 +2099,8 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart,
{IncrementValue}, {false, false}, DL);
auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
{InLoopIncrement, TripCount}, DL,
"active.lane.mask.next");
{InLoopIncrement, TripCount, ALMMultiplier},
DL, "active.lane.mask.next");
LaneMaskPhi->addOperand(ALM);

// Replace the original terminator with BranchOnCond. We have to invert the
Expand Down Expand Up @@ -2101,9 +2177,12 @@ void VPlanTransforms::addActiveLaneMask(
Plan, DataAndControlFlowWithoutRuntimeCheck);
} else {
VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
LaneMask = B.createNaryOp(VPInstruction::ActiveLaneMask,
{WideCanonicalIV, Plan.getTripCount()}, nullptr,
"active.lane.mask");
VPValue *ALMMultiplier = Plan.getOrAddLiveIn(
ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1));
LaneMask =
B.createNaryOp(VPInstruction::ActiveLaneMask,
{WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
nullptr, "active.lane.mask");
}

// Walk users of WideCanonicalIV and replace all compares of the form
Expand Down
3 changes: 2 additions & 1 deletion llvm/lib/Transforms/Vectorize/VPlanTransforms.h
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,8 @@ struct VPlanTransforms {
/// resulting plan to \p BestVF and \p BestUF.
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
unsigned BestUF,
PredicatedScalarEvolution &PSE);
PredicatedScalarEvolution &PSE,
bool DataAndControlFlowWithoutRuntimeCheck);

/// Apply VPlan-to-VPlan optimizations to \p Plan, including induction recipe
/// optimizations, dead recipe removal, replicate region optimizations and
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,7 @@ void UnrollState::unrollHeaderPHIByUF(VPHeaderPHIRecipe *R,
} else {
assert(isa<VPActiveLaneMaskPHIRecipe>(R) &&
"unexpected header phi recipe not needing unrolled part");
cast<VPActiveLaneMaskPHIRecipe>(Copy)->setUnrollPart(Part);
}
}
}
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ bool vputils::isHeaderMask(const VPValue *V, VPlan &Plan) {
VPValue *A, *B;
using namespace VPlanPatternMatch;

if (match(V, m_ActiveLaneMask(m_VPValue(A), m_VPValue(B))))
if (match(V, m_ActiveLaneMask(m_VPValue(A), m_VPValue(B), m_SpecificInt(1))))
return B == Plan.getTripCount() &&
(match(A, m_ScalarIVSteps(m_Specific(Plan.getCanonicalIV()),
m_SpecificInt(1),
Expand Down
Loading
Loading