diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 63fccee63c0ae..1dff9c3513a28 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -163,6 +163,7 @@ bool llvm::isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID,
   case Intrinsic::is_fpclass:
   case Intrinsic::vp_is_fpclass:
   case Intrinsic::powi:
+  case Intrinsic::vector_extract:
     return (ScalarOpdIdx == 1);
   case Intrinsic::smul_fix:
   case Intrinsic::smul_fix_sat:
@@ -195,6 +196,7 @@ bool llvm::isVectorIntrinsicWithOverloadTypeAtArg(
   case Intrinsic::vp_llrint:
   case Intrinsic::ucmp:
   case Intrinsic::scmp:
+  case Intrinsic::vector_extract:
     return OpdIdx == -1 || OpdIdx == 0;
   case Intrinsic::modf:
   case Intrinsic::sincos:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 144f35e10132f..dd54d964f8883 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -44,6 +44,7 @@ class VPRecipeBuilder;
 struct VFRange;
 
 extern cl::opt<bool> EnableVPlanNativePath;
+extern cl::opt<bool> EnableWideActiveLaneMask;
 extern cl::opt<unsigned> ForceTargetInstructionCost;
 
 /// VPlan-based builder utility analogous to IRBuilder.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e7bae17dd2ceb..6e5f4caf93d23 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -356,6 +356,10 @@ cl::opt<bool> llvm::EnableVPlanNativePath(
     cl::desc("Enable VPlan-native vectorization path with "
              "support for outer loop vectorization."));
 
+cl::opt<bool> llvm::EnableWideActiveLaneMask(
+    "enable-wide-lane-mask", cl::init(false), cl::Hidden,
+    cl::desc("Enable use of wide get active lane mask instructions"));
+
 cl::opt<bool> llvm::VerifyEachVPlan(
     "vplan-verify-each",
 #ifdef EXPENSIVE_CHECKS
@@ -7328,7 +7332,10 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
     VPlanTransforms::runPass(VPlanTransforms::addBranchWeightToMiddleTerminator,
                              BestVPlan, BestVF, VScale);
   }
-  VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
+  VPlanTransforms::optimizeForVFAndUF(
+      BestVPlan, BestVF, BestUF, PSE,
+      ILV.Cost->getTailFoldingStyle() ==
+          TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck);
   VPlanTransforms::simplifyRecipes(BestVPlan, *Legal->getWidestInductionType());
   VPlanTransforms::narrowInterleaveGroups(
       BestVPlan, BestVF,
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 356af4a0e74e4..6080aa88ec306 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -954,6 +954,9 @@ class VPInstruction : public VPRecipeWithIRFlags,
     // part if it is scalar. In the latter case, the recipe will be removed
     // during unrolling.
     ExtractPenultimateElement,
+    // Extracts a subvector from a vector (first operand) starting at a given
+    // offset (second operand).
+    ExtractSubvector,
     LogicalAnd, // Non-poison propagating logical And.
     // Add an offset in bytes (second operand) to a base pointer (first
     // operand). Only generates scalar values (either for the first lane only or
@@ -1887,6 +1890,9 @@ class VPHeaderPHIRecipe : public VPSingleDefRecipe, public VPPhiAccessors {
     return getOperand(1);
   }
 
+  // Update the incoming value from the loop backedge.
+  void setBackedgeValue(VPValue *V) { setOperand(1, V); }
+
   /// Returns the backedge value as a recipe. The backedge value is guaranteed
   /// to be a recipe.
   virtual VPRecipeBase &getBackedgeRecipe() {
@@ -3234,10 +3240,12 @@ class VPCanonicalIVPHIRecipe : public VPHeaderPHIRecipe {
 /// TODO: It would be good to use the existing VPWidenPHIRecipe instead and
 /// remove VPActiveLaneMaskPHIRecipe.
 class VPActiveLaneMaskPHIRecipe : public VPHeaderPHIRecipe {
+  unsigned UnrollPart = 0;
+
 public:
-  VPActiveLaneMaskPHIRecipe(VPValue *StartMask, DebugLoc DL)
-      : VPHeaderPHIRecipe(VPDef::VPActiveLaneMaskPHISC, nullptr, StartMask,
-                          DL) {}
+  VPActiveLaneMaskPHIRecipe(VPValue *StartMask, DebugLoc DL, unsigned Part = 0)
+      : VPHeaderPHIRecipe(VPDef::VPActiveLaneMaskPHISC, nullptr, StartMask, DL),
+        UnrollPart(Part) {}
 
   ~VPActiveLaneMaskPHIRecipe() override = default;
 
@@ -3250,6 +3258,9 @@ class VPActiveLaneMaskPHIRecipe : public VPHeaderPHIRecipe {
 
   VP_CLASSOF_IMPL(VPDef::VPActiveLaneMaskPHISC)
 
+  unsigned getUnrollPart() { return UnrollPart; }
+  void setUnrollPart(unsigned Part) { UnrollPart = Part; }
+
   /// Generate the active lane mask phi of the vector loop.
   void execute(VPTransformState &State) override;
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 92db9674ef42b..5e7f797b70978 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -74,6 +74,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
   switch (Opcode) {
   case Instruction::ExtractElement:
   case Instruction::Freeze:
+  case VPInstruction::ExtractSubvector:
   case VPInstruction::ReductionStartVector:
     return inferScalarType(R->getOperand(0));
   case Instruction::Select: {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index efea99f22d086..62898bf2c1991 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -384,10 +384,11 @@ m_Broadcast(const Op0_t &Op0) {
   return m_VPInstruction<VPInstruction::Broadcast>(Op0);
 }
 
-template <typename Op0_t, typename Op1_t>
-inline BinaryVPInstruction_match<Op0_t, Op1_t, VPInstruction::ActiveLaneMask>
-m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1) {
-  return m_VPInstruction<VPInstruction::ActiveLaneMask>(Op0, Op1);
+template <typename Op0_t, typename Op1_t, typename Op2_t>
+inline TernaryVPInstruction_match<Op0_t, Op1_t, Op2_t,
+                                  VPInstruction::ActiveLaneMask>
+m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2) {
+  return m_VPInstruction<VPInstruction::ActiveLaneMask>(Op0, Op1, Op2);
 }
 
 template <typename Op0_t, typename Op1_t>
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index ccb7512051d77..c776d5cb91278 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -469,15 +469,16 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
   case Instruction::ICmp:
   case Instruction::FCmp:
   case Instruction::Store:
-  case VPInstruction::ActiveLaneMask:
   case VPInstruction::BranchOnCount:
   case VPInstruction::ComputeReductionResult:
+  case VPInstruction::ExtractSubvector:
   case VPInstruction::FirstOrderRecurrenceSplice:
   case VPInstruction::LogicalAnd:
   case VPInstruction::PtrAdd:
   case VPInstruction::WideIVStep:
     return 2;
   case Instruction::Select:
+  case VPInstruction::ActiveLaneMask:
   case VPInstruction::ComputeAnyOfResult:
   case VPInstruction::ReductionStartVector:
     return 3;
@@ -614,7 +615,9 @@ Value *VPInstruction::generate(VPTransformState &State) {
                                Name);
 
     auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
-    auto *PredTy = VectorType::get(Int1Ty, State.VF);
+    auto PredTy = VectorType::get(
+        Int1Ty, State.VF * cast<ConstantInt>(getOperand(2)->getLiveInIRValue())
+                               ->getZExtValue());
     return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
                                    {PredTy, ScalarTC->getType()},
                                    {VIVElem0, ScalarTC}, nullptr, Name);
@@ -846,6 +849,14 @@ Value *VPInstruction::generate(VPTransformState &State) {
     Res->setName(Name);
     return Res;
   }
+  case VPInstruction::ExtractSubvector: {
+    Value *Vec = State.get(getOperand(0));
+    assert(State.VF.isVector());
+    auto Idx = cast<ConstantInt>(getOperand(1)->getLiveInIRValue());
+    auto ResTy = VectorType::get(
+        State.TypeAnalysis.inferScalarType(getOperand(0)), State.VF);
+    return Builder.CreateExtractVector(ResTy, Vec, Idx);
+  }
   case VPInstruction::LogicalAnd: {
     Value *A = State.get(getOperand(0));
     Value *B = State.get(getOperand(1));
@@ -1044,6 +1055,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
   case VPInstruction::CanonicalIVIncrementForPart:
   case VPInstruction::ExtractLastElement:
   case VPInstruction::ExtractPenultimateElement:
+  case VPInstruction::ExtractSubvector:
   case VPInstruction::FirstActiveLane:
   case VPInstruction::FirstOrderRecurrenceSplice:
   case VPInstruction::LogicalAnd:
@@ -1174,6 +1186,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
   case VPInstruction::ExtractPenultimateElement:
     O << "extract-penultimate-element";
     break;
+  case VPInstruction::ExtractSubvector:
+    O << "extract-subvector";
+    break;
   case VPInstruction::ComputeAnyOfResult:
     O << "compute-anyof-result";
     break;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 90137b72c83fb..b8f14ca88e8a3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "VPlanTransforms.h"
+#include "LoopVectorizationPlanner.h"
 #include "VPRecipeBuilder.h"
 #include "VPlan.h"
 #include "VPlanAnalysis.h"
@@ -1432,20 +1433,93 @@ static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan,
   return SE.isKnownPredicate(CmpInst::ICMP_EQ, TripCount, C);
 }
 
+static void extractFromWideActiveLaneMask(VPlan &Plan, ElementCount VF,
+                                          unsigned UF) {
+  VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
+  auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
+  VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
+  auto *Term = &ExitingVPBB->back();
+
+  VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
+  LLVMContext &Ctx = CanonicalIV->getScalarType()->getContext();
+  using namespace llvm::VPlanPatternMatch;
+
+  auto extractFromALM = [&](VPInstruction *ALM, VPInstruction *InsBefore,
+                            SmallVectorImpl<VPValue *> &Extracts) {
+    VPBuilder Builder(InsBefore);
+    DebugLoc DL = ALM->getDebugLoc();
+    for (unsigned Part = 0; Part < UF; ++Part) {
+      SmallVector<VPValue *> Ops;
+      Ops.append({ALM, Plan.getOrAddLiveIn(
+                           ConstantInt::get(IntegerType::getInt64Ty(Ctx),
+                                            VF.getKnownMinValue() * Part))});
+      Extracts.push_back(
+          Builder.createNaryOp(VPInstruction::ExtractSubvector, Ops, DL));
+    }
+  };
+
+  // Create a list of each active lane mask phi, ordered by unroll part.
+  SmallVector<VPActiveLaneMaskPHIRecipe *> Phis(UF, nullptr);
+  for (VPRecipeBase &R : Header->phis())
+    if (auto *Phi = dyn_cast<VPActiveLaneMaskPHIRecipe>(&R))
+      Phis[Phi->getUnrollPart()] = Phi;
+
+  assert(all_of(Phis, [](VPActiveLaneMaskPHIRecipe *Phi) { return Phi; }) &&
+         "Expected one VPActiveLaneMaskPHIRecipe for each unroll part");
+
+  // When using wide lane masks, the return type of the get.active.lane.mask
+  // intrinsic is VF x UF (second operand).
+  VPValue *ALMMultiplier =
+      Plan.getOrAddLiveIn(ConstantInt::get(IntegerType::getInt64Ty(Ctx), UF));
+  cast<VPInstruction>(Phis[0]->getStartValue())->setOperand(2, ALMMultiplier);
+  cast<VPInstruction>(Phis[0]->getBackedgeValue())
+      ->setOperand(2, ALMMultiplier);
+
+  // Create UF x extract vectors and insert into preheader.
+  SmallVector<VPValue *> EntryExtracts;
+  auto *EntryALM = cast<VPInstruction>(Phis[0]->getStartValue());
+  extractFromALM(EntryALM, cast<VPInstruction>(&EntryALM->getParent()->back()),
+                 EntryExtracts);
+
+  // Create UF x extract vectors and insert before the loop compare & branch,
+  // updating the compare to use the first extract.
+  SmallVector<VPValue *> LoopExtracts;
+  auto *LoopALM = cast<VPInstruction>(Phis[0]->getBackedgeValue());
+  VPInstruction *Not = cast<VPInstruction>(Term->getOperand(0));
+  extractFromALM(LoopALM, Not, LoopExtracts);
+  Not->setOperand(0, LoopExtracts[0]);
+
+  // Update the incoming values of active lane mask phis.
+  for (unsigned Part = 0; Part < UF; ++Part) {
+    Phis[Part]->setStartValue(EntryExtracts[Part]);
+    Phis[Part]->setBackedgeValue(LoopExtracts[Part]);
+  }
+
+  return;
+}
+
 /// Try to simplify the branch condition of \p Plan. This may restrict the
 /// resulting plan to \p BestVF and \p BestUF.
-static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
-                                              unsigned BestUF,
-                                              PredicatedScalarEvolution &PSE) {
+static bool
+simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
+                                  unsigned BestUF,
+                                  PredicatedScalarEvolution &PSE,
+                                  bool DataAndControlFlowWithoutRuntimeCheck) {
   VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
   VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
   auto *Term = &ExitingVPBB->back();
   VPValue *Cond;
   ScalarEvolution &SE = *PSE.getSE();
   using namespace llvm::VPlanPatternMatch;
-  if (match(Term, m_BranchOnCount(m_VPValue(), m_VPValue())) ||
-      match(Term, m_BranchOnCond(
-                      m_Not(m_ActiveLaneMask(m_VPValue(), m_VPValue()))))) {
+  auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
+  bool BranchALM = match(Term, m_BranchOnCond(m_Not(m_ActiveLaneMask(
+                                   m_VPValue(), m_VPValue(), m_VPValue()))));
+
+  if (BranchALM || match(Term, m_BranchOnCount(m_VPValue(), m_VPValue()))) {
+    if (BranchALM && DataAndControlFlowWithoutRuntimeCheck &&
+        EnableWideActiveLaneMask && BestVF.isVector() && BestUF > 1)
+      extractFromWideActiveLaneMask(Plan, BestVF, BestUF);
+
     // Try to simplify the branch condition if TC <= VF * UF when the latch
     // terminator is BranchOnCount or BranchOnCond where the input is
     // Not(ActiveLaneMask).
@@ -1470,7 +1544,6 @@ static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
   // The vector loop region only executes once. If possible, completely remove
   // the region, otherwise replace the terminator controlling the latch with
   // (BranchOnCond true).
-  auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
   auto *CanIVTy = Plan.getCanonicalIV()->getScalarType();
   if (all_of(
           Header->phis(),
@@ -1507,14 +1580,15 @@ static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
   return true;
 }
 
-void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
-                                         unsigned BestUF,
-                                         PredicatedScalarEvolution &PSE) {
+void VPlanTransforms::optimizeForVFAndUF(
+    VPlan &Plan, ElementCount BestVF, unsigned BestUF,
+    PredicatedScalarEvolution &PSE,
+    bool DataAndControlFlowWithoutRuntimeCheck) {
   assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
   assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
 
-  bool MadeChange =
-      simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
+  bool MadeChange = simplifyBranchConditionForVFAndUF(
+      Plan, BestVF, BestUF, PSE, DataAndControlFlowWithoutRuntimeCheck);
   MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
 
   if (MadeChange) {
@@ -2006,9 +2080,11 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
                                      "index.part.next");
 
   // Create the active lane mask instruction in the VPlan preheader.
-  auto *EntryALM =
-      Builder.createNaryOp(VPInstruction::ActiveLaneMask, {EntryIncrement, TC},
-                           DL, "active.lane.mask.entry");
+  VPValue *ALMMultiplier = Plan.getOrAddLiveIn(
+      ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1));
+  auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
+                                        {EntryIncrement, TC, ALMMultiplier}, DL,
+                                        "active.lane.mask.entry");
 
   // Now create the ActiveLaneMaskPhi recipe in the main loop using the
   // preheader ActiveLaneMask instruction.
@@ -2023,8 +2099,8 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
       Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart,
                                   {IncrementValue}, {false, false}, DL);
   auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
-                                   {InLoopIncrement, TripCount}, DL,
-                                   "active.lane.mask.next");
+                                   {InLoopIncrement, TripCount, ALMMultiplier},
+                                   DL, "active.lane.mask.next");
   LaneMaskPhi->addOperand(ALM);
 
   // Replace the original terminator with BranchOnCond. We have to invert the
@@ -2101,9 +2177,12 @@ void VPlanTransforms::addActiveLaneMask(
         Plan, DataAndControlFlowWithoutRuntimeCheck);
   } else {
     VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
-    LaneMask = B.createNaryOp(VPInstruction::ActiveLaneMask,
-                              {WideCanonicalIV, Plan.getTripCount()}, nullptr,
-                              "active.lane.mask");
+    VPValue *ALMMultiplier = Plan.getOrAddLiveIn(
+        ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1));
+    LaneMask =
+        B.createNaryOp(VPInstruction::ActiveLaneMask,
+                       {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
+                       nullptr, "active.lane.mask");
   }
 
   // Walk users of WideCanonicalIV and replace all compares of the form
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 8d2eded45da22..920c7aa32cc97 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -109,7 +109,8 @@ struct VPlanTransforms {
   /// resulting plan to \p BestVF and \p BestUF.
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, - PredicatedScalarEvolution &PSE); + PredicatedScalarEvolution &PSE, + bool DataAndControlFlowWithoutRuntimeCheck); /// Apply VPlan-to-VPlan optimizations to \p Plan, including induction recipe /// optimizations, dead recipe removal, replicate region optimizations and diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp index 2dd43c092ff7a..76a37d5ba839b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp @@ -250,6 +250,7 @@ void UnrollState::unrollHeaderPHIByUF(VPHeaderPHIRecipe *R, } else { assert(isa(R) && "unexpected header phi recipe not needing unrolled part"); + cast(Copy)->setUnrollPart(Part); } } } diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index 81bd21bb904c0..9fdc199fc1dfa 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -61,7 +61,7 @@ bool vputils::isHeaderMask(const VPValue *V, VPlan &Plan) { VPValue *A, *B; using namespace VPlanPatternMatch; - if (match(V, m_ActiveLaneMask(m_VPValue(A), m_VPValue(B)))) + if (match(V, m_ActiveLaneMask(m_VPValue(A), m_VPValue(B), m_SpecificInt(1)))) return B == Plan.getTripCount() && (match(A, m_ScalarIVSteps(m_Specific(Plan.getCanonicalIV()), m_SpecificInt(1), diff --git a/llvm/test/CodeGen/AArch64/sve-wide-lane-mask.ll b/llvm/test/CodeGen/AArch64/sve-wide-lane-mask.ll new file mode 100644 index 0000000000000..d59dbec491467 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-wide-lane-mask.ll @@ -0,0 +1,121 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mattr=+sve < %s | FileCheck %s -check-prefix CHECK-SVE +; RUN: llc -mattr=+sve2p1 < %s | FileCheck %s -check-prefix CHECK-SVE2p1 + +target triple = "aarch64-unknown-linux" + +define void @scalable_wide_active_lane_mask(ptr %dst, ptr %src, i64 %n) #0 { +; CHECK-SVE-LABEL: scalable_wide_active_lane_mask: +; CHECK-SVE: // %bb.0: // %entry +; CHECK-SVE-NEXT: cmp x2, #1 +; CHECK-SVE-NEXT: b.lt .LBB0_3 +; CHECK-SVE-NEXT: // %bb.1: // %vector.ph +; CHECK-SVE-NEXT: rdvl x8, #2 +; CHECK-SVE-NEXT: rdvl x9, #1 +; CHECK-SVE-NEXT: mov x11, xzr +; CHECK-SVE-NEXT: subs x10, x2, x8 +; CHECK-SVE-NEXT: csel x10, xzr, x10, lo +; CHECK-SVE-NEXT: whilelo p1.b, xzr, x2 +; CHECK-SVE-NEXT: whilelo p0.b, x9, x2 +; CHECK-SVE-NEXT: .LBB0_2: // %vector.body +; CHECK-SVE-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-SVE-NEXT: add x12, x1, x11 +; CHECK-SVE-NEXT: ld1b { z0.b }, p1/z, [x1, x11] +; CHECK-SVE-NEXT: add x13, x0, x11 +; CHECK-SVE-NEXT: ld1b { z1.b }, p0/z, [x12, #1, mul vl] +; CHECK-SVE-NEXT: adds x12, x11, x9 +; CHECK-SVE-NEXT: csinv x12, x12, xzr, lo +; CHECK-SVE-NEXT: mul z0.b, z0.b, #3 +; CHECK-SVE-NEXT: mul z1.b, z1.b, #3 +; CHECK-SVE-NEXT: st1b { z0.b }, p1, [x0, x11] +; CHECK-SVE-NEXT: st1b { z1.b }, p0, [x13, #1, mul vl] +; CHECK-SVE-NEXT: whilelo p0.b, x12, x10 +; CHECK-SVE-NEXT: whilelo p1.b, x11, x10 +; CHECK-SVE-NEXT: add x11, x11, x8 +; CHECK-SVE-NEXT: b.mi .LBB0_2 +; CHECK-SVE-NEXT: .LBB0_3: // %for.end +; CHECK-SVE-NEXT: ret +; +; CHECK-SVE2p1-LABEL: scalable_wide_active_lane_mask: +; CHECK-SVE2p1: // %bb.0: // %entry +; CHECK-SVE2p1-NEXT: cmp x2, #1 +; CHECK-SVE2p1-NEXT: b.lt .LBB0_3 +; CHECK-SVE2p1-NEXT: // %bb.1: // %vector.ph +; CHECK-SVE2p1-NEXT: rdvl x9, #2 +; CHECK-SVE2p1-NEXT: mov x8, xzr +; 
CHECK-SVE2p1-NEXT: subs x9, x2, x9 +; CHECK-SVE2p1-NEXT: csel x9, xzr, x9, lo +; CHECK-SVE2p1-NEXT: whilelo { p0.b, p1.b }, xzr, x2 +; CHECK-SVE2p1-NEXT: .LBB0_2: // %vector.body +; CHECK-SVE2p1-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-SVE2p1-NEXT: add x10, x1, x8 +; CHECK-SVE2p1-NEXT: ld1b { z0.b }, p0/z, [x1, x8] +; CHECK-SVE2p1-NEXT: ld1b { z1.b }, p1/z, [x10, #1, mul vl] +; CHECK-SVE2p1-NEXT: add x10, x0, x8 +; CHECK-SVE2p1-NEXT: mul z0.b, z0.b, #3 +; CHECK-SVE2p1-NEXT: mul z1.b, z1.b, #3 +; CHECK-SVE2p1-NEXT: st1b { z0.b }, p0, [x0, x8] +; CHECK-SVE2p1-NEXT: st1b { z1.b }, p1, [x10, #1, mul vl] +; CHECK-SVE2p1-NEXT: whilelo { p0.b, p1.b }, x8, x9 +; CHECK-SVE2p1-NEXT: incb x8, all, mul #2 +; CHECK-SVE2p1-NEXT: mov z0.b, p0/z, #1 // =0x1 +; CHECK-SVE2p1-NEXT: fmov w10, s0 +; CHECK-SVE2p1-NEXT: tbnz w10, #0, .LBB0_2 +; CHECK-SVE2p1-NEXT: .LBB0_3: // %for.end +; CHECK-SVE2p1-NEXT: ret +entry: + %cmp = icmp sgt i64 %n, 0 + br i1 %cmp, label %vector.ph, label %for.end + +vector.ph: + %0 = tail call i64 @llvm.vscale.i64() + %1 = shl nuw nsw i64 %0, 5 + %2 = tail call i64 @llvm.vscale.i64() + %3 = shl nuw nsw i64 %2, 5 + %4 = tail call i64 @llvm.usub.sat.i64(i64 %n, i64 %3) + %active.lane.mask.entry = tail call @llvm.get.active.lane.mask.nxv32i1.i64(i64 0, i64 %n) + %5 = tail call @llvm.vector.extract.nxv16i1.nxv32i1( %active.lane.mask.entry, i64 0) + %6 = tail call @llvm.vector.extract.nxv16i1.nxv32i1( %active.lane.mask.entry, i64 16) + %7 = tail call i64 @llvm.vscale.i64() + %8 = shl nuw nsw i64 %7, 4 + %9 = tail call i64 @llvm.vscale.i64() + %10 = shl nuw nsw i64 %9, 4 + br label %vector.body + +vector.body: + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %active.lane.mask = phi [ %5, %vector.ph ], [ %17, %vector.body ] + %active.lane.mask2 = phi [ %6, %vector.ph ], [ %18, %vector.body ] + %11 = getelementptr inbounds nuw i8, ptr %src, i64 %index + %12 = getelementptr inbounds nuw i8, ptr %11, i64 %8 + %wide.masked.load = tail call @llvm.masked.load.nxv16i8.p0(ptr %11, i32 1, %active.lane.mask, poison) + %wide.masked.load3 = tail call @llvm.masked.load.nxv16i8.p0(ptr nonnull %12, i32 1, %active.lane.mask2, poison) + %13 = mul %wide.masked.load, splat (i8 3) + %14 = mul %wide.masked.load3, splat (i8 3) + %15 = getelementptr inbounds nuw i8, ptr %dst, i64 %index + %16 = getelementptr inbounds nuw i8, ptr %15, i64 %10 + tail call void @llvm.masked.store.nxv16i8.p0( %13, ptr %15, i32 1, %active.lane.mask) + tail call void @llvm.masked.store.nxv16i8.p0( %14, ptr %16, i32 1, %active.lane.mask2) + %index.next = add i64 %index, %1 + %active.lane.mask.next = tail call @llvm.get.active.lane.mask.nxv32i1.i64(i64 %index, i64 %4) + %17 = tail call @llvm.vector.extract.nxv16i1.nxv32i1( %active.lane.mask.next, i64 0) + %18 = tail call @llvm.vector.extract.nxv16i1.nxv32i1( %active.lane.mask.next, i64 16) + %19 = extractelement %17, i64 0 + br i1 %19, label %vector.body, label %for.end, !llvm.loop !0 + +for.end: + ret void +} + +declare i64 @llvm.vscale.i64() +declare @llvm.get.active.lane.mask.nxv32i1.i64(i64, i64) +declare @llvm.vector.extract.nxv16i1.nxv32i1(, i64 immarg) +declare @llvm.masked.load.nxv16i8.p0(ptr captures(none), i32 immarg, , ) +declare void @llvm.masked.store.nxv16i8.p0(, ptr captures(none), i32 immarg, ) +declare i64 @llvm.usub.sat.i64(i64, i64) + +attributes #0 = { vscale_range(1,16) } + +!0 = distinct !{!0, !1, !2} +!1 = !{!"llvm.loop.isvectorized", i32 1} +!2 = !{!"llvm.loop.unroll.runtime.disable"} diff --git 
a/llvm/test/Transforms/LoopVectorize/AArch64/fixed-wide-lane-mask.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fixed-wide-lane-mask.ll new file mode 100644 index 0000000000000..52128de119b9b --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/fixed-wide-lane-mask.ll @@ -0,0 +1,129 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S --passes="default" -scalable-vectorization=off -prefer-predicate-over-epilogue=predicate-dont-vectorize -enable-wide-lane-mask \ +; RUN: -force-vector-width=4 -force-vector-interleave=0 < %s | FileCheck %s -check-prefix CHECK-UF0 +; RUN: opt -S --passes="default" -scalable-vectorization=off -prefer-predicate-over-epilogue=predicate-dont-vectorize -enable-wide-lane-mask \ +; RUN: -force-vector-width=4 -force-vector-interleave=2 < %s | FileCheck %s -check-prefix CHECK-UF2 +; RUN: opt -S --passes="default" -scalable-vectorization=off -prefer-predicate-over-epilogue=predicate-dont-vectorize -enable-wide-lane-mask \ +; RUN: -force-vector-width=4 -force-vector-interleave=4 < %s | FileCheck %s -check-prefix CHECK-UF4 + +target triple = "aarch64-unknown-linux" + +define void @fixed_wide_active_lane_mask(ptr noalias %dst, ptr noalias readonly %src, i64 %n) #0 { +; CHECK-UF0-LABEL: define void @fixed_wide_active_lane_mask( +; CHECK-UF0-SAME: ptr noalias writeonly captures(none) [[DST:%.*]], ptr noalias readonly captures(none) [[SRC:%.*]], i64 [[N:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-UF0-NEXT: entry: +; CHECK-UF0-NEXT: [[LD:%.*]] = load i32, ptr [[SRC]], align 4 +; CHECK-UF0-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.usub.sat.i64(i64 [[N]], i64 4) +; CHECK-UF0-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 0, i64 [[N]]) +; CHECK-UF0-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[LD]], i64 0 +; CHECK-UF0-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-UF0-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-UF0: vector.body: +; CHECK-UF0-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-UF0-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-UF0-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i32, ptr [[DST]], i64 [[INDEX]] +; CHECK-UF0-NEXT: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP2]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-UF0-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; CHECK-UF0-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = tail call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 [[TMP0]]) +; CHECK-UF0-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; CHECK-UF0-NEXT: br i1 [[TMP3]], label [[VECTOR_BODY]], label [[FOR_END:%.*]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-UF0: for.end: +; CHECK-UF0-NEXT: ret void +; +; CHECK-UF2-LABEL: define void @fixed_wide_active_lane_mask( +; CHECK-UF2-SAME: ptr noalias writeonly captures(none) [[DST:%.*]], ptr noalias readonly captures(none) [[SRC:%.*]], i64 [[N:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-UF2-NEXT: entry: +; CHECK-UF2-NEXT: [[LD:%.*]] = load i32, ptr [[SRC]], align 4 +; CHECK-UF2-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.usub.sat.i64(i64 [[N]], i64 8) +; CHECK-UF2-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <8 x i1> 
@llvm.get.active.lane.mask.v8i1.i64(i64 0, i64 [[N]]) +; CHECK-UF2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i1> [[ACTIVE_LANE_MASK_ENTRY]], <8 x i1> poison, <4 x i32> +; CHECK-UF2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i1> [[ACTIVE_LANE_MASK_ENTRY]], <8 x i1> poison, <4 x i32> +; CHECK-UF2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[LD]], i64 0 +; CHECK-UF2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-UF2-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-UF2: vector.body: +; CHECK-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-UF2-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[TMP1]], [[ENTRY]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-UF2-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi <4 x i1> [ [[TMP2]], [[ENTRY]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] +; CHECK-UF2-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i32, ptr [[DST]], i64 [[INDEX]] +; CHECK-UF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP3]], i64 16 +; CHECK-UF2-NEXT: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-UF2-NEXT: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP4]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK2]]) +; CHECK-UF2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 +; CHECK-UF2-NEXT: [[ACTIVE_LANE_MASK_NEXT:%.*]] = tail call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 [[INDEX]], i64 [[TMP0]]) +; CHECK-UF2-NEXT: [[TMP5]] = shufflevector <8 x i1> [[ACTIVE_LANE_MASK_NEXT]], <8 x i1> poison, <4 x i32> +; CHECK-UF2-NEXT: [[TMP6]] = shufflevector <8 x i1> [[ACTIVE_LANE_MASK_NEXT]], <8 x i1> poison, <4 x i32> +; CHECK-UF2-NEXT: [[TMP7:%.*]] = extractelement <8 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; CHECK-UF2-NEXT: br i1 [[TMP7]], label [[VECTOR_BODY]], label [[FOR_END:%.*]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-UF2: for.end: +; CHECK-UF2-NEXT: ret void +; +; CHECK-UF4-LABEL: define void @fixed_wide_active_lane_mask( +; CHECK-UF4-SAME: ptr noalias writeonly captures(none) [[DST:%.*]], ptr noalias readonly captures(none) [[SRC:%.*]], i64 [[N:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-UF4-NEXT: entry: +; CHECK-UF4-NEXT: [[LD:%.*]] = load i32, ptr [[SRC]], align 4 +; CHECK-UF4-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.usub.sat.i64(i64 [[N]], i64 16) +; CHECK-UF4-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 0, i64 [[N]]) +; CHECK-UF4-NEXT: [[TMP1:%.*]] = shufflevector <16 x i1> [[ACTIVE_LANE_MASK_ENTRY]], <16 x i1> poison, <4 x i32> +; CHECK-UF4-NEXT: [[TMP2:%.*]] = shufflevector <16 x i1> [[ACTIVE_LANE_MASK_ENTRY]], <16 x i1> poison, <4 x i32> +; CHECK-UF4-NEXT: [[TMP3:%.*]] = shufflevector <16 x i1> [[ACTIVE_LANE_MASK_ENTRY]], <16 x i1> poison, <4 x i32> +; CHECK-UF4-NEXT: [[TMP4:%.*]] = shufflevector <16 x i1> [[ACTIVE_LANE_MASK_ENTRY]], <16 x i1> poison, <4 x i32> +; CHECK-UF4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[LD]], i64 0 +; CHECK-UF4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-UF4-NEXT: br label [[VECTOR_BODY1:%.*]] +; CHECK-UF4: vector.body: +; CHECK-UF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY1]] ] +; CHECK-UF4-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[TMP1]], 
[[ENTRY]] ], [ [[TMP9:%.*]], [[VECTOR_BODY1]] ] +; CHECK-UF4-NEXT: [[ACTIVE_LANE_MASK4:%.*]] = phi <4 x i1> [ [[TMP2]], [[ENTRY]] ], [ [[TMP10:%.*]], [[VECTOR_BODY1]] ] +; CHECK-UF4-NEXT: [[ACTIVE_LANE_MASK5:%.*]] = phi <4 x i1> [ [[TMP3]], [[ENTRY]] ], [ [[TMP11:%.*]], [[VECTOR_BODY1]] ] +; CHECK-UF4-NEXT: [[ACTIVE_LANE_MASK6:%.*]] = phi <4 x i1> [ [[TMP4]], [[ENTRY]] ], [ [[TMP12:%.*]], [[VECTOR_BODY1]] ] +; CHECK-UF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i32, ptr [[DST]], i64 [[INDEX]] +; CHECK-UF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP5]], i64 16 +; CHECK-UF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP5]], i64 32 +; CHECK-UF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP5]], i64 48 +; CHECK-UF4-NEXT: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP5]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-UF4-NEXT: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP6]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK4]]) +; CHECK-UF4-NEXT: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP7]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK5]]) +; CHECK-UF4-NEXT: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP8]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK6]]) +; CHECK-UF4-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 +; CHECK-UF4-NEXT: [[ACTIVE_LANE_MASK_NEXT:%.*]] = tail call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 [[INDEX]], i64 [[TMP0]]) +; CHECK-UF4-NEXT: [[TMP9]] = shufflevector <16 x i1> [[ACTIVE_LANE_MASK_NEXT]], <16 x i1> poison, <4 x i32> +; CHECK-UF4-NEXT: [[TMP10]] = shufflevector <16 x i1> [[ACTIVE_LANE_MASK_NEXT]], <16 x i1> poison, <4 x i32> +; CHECK-UF4-NEXT: [[TMP11]] = shufflevector <16 x i1> [[ACTIVE_LANE_MASK_NEXT]], <16 x i1> poison, <4 x i32> +; CHECK-UF4-NEXT: [[TMP12]] = shufflevector <16 x i1> [[ACTIVE_LANE_MASK_NEXT]], <16 x i1> poison, <4 x i32> +; CHECK-UF4-NEXT: [[TMP13:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; CHECK-UF4-NEXT: br i1 [[TMP13]], label [[VECTOR_BODY1]], label [[FOR_END:%.*]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-UF4: for.end: +; CHECK-UF4-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %ld = load i32, ptr %src + %arrayidx = getelementptr inbounds i32, ptr %dst, i64 %iv + store i32 %ld, ptr %arrayidx + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: + ret void +} + +attributes #0 = { nounwind "target-features"="+neon,+sve" } + +;. +; CHECK-UF0: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-UF0: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-UF0: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +;. +; CHECK-UF2: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-UF2: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-UF2: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +;. +; CHECK-UF4: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-UF4: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-UF4: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +;. 
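For reference, below is a minimal hand-written sketch of the fixed-width pattern this test exercises for VF=4/UF=2. It is an illustration distilled from the checks above, not output of the patch, and the value names (%wide.mask, %mask.part0, %gep0, %data0, ...) are hypothetical. The vectorizer emits llvm.vector.extract for the new ExtractSubvector recipe; later passes in the default pipeline appear to fold these fixed-width extracts into the shufflevector sequences seen in the CHECK-UF2/CHECK-UF4 lines.

; Sketch (assumed names): one wide <8 x i1> lane mask feeds both unrolled parts
; via llvm.vector.extract at element offsets 0 and 4.
  %wide.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 %index, i64 %n)
  %mask.part0 = call <4 x i1> @llvm.vector.extract.v4i1.v8i1(<8 x i1> %wide.mask, i64 0)
  %mask.part1 = call <4 x i1> @llvm.vector.extract.v4i1.v8i1(<8 x i1> %wide.mask, i64 4)
  call void @llvm.masked.store.v4i32.p0(<4 x i32> %data0, ptr %gep0, i32 4, <4 x i1> %mask.part0)
  call void @llvm.masked.store.v4i32.p0(<4 x i32> %data1, ptr %gep1, i32 4, <4 x i1> %mask.part1)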
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-wide-lane-mask.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-wide-lane-mask.ll new file mode 100644 index 0000000000000..790b78f9002d2 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-wide-lane-mask.ll @@ -0,0 +1,335 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S --passes="default" -prefer-predicate-over-epilogue=predicate-dont-vectorize -enable-wide-lane-mask -force-vector-interleave=0 < %s | FileCheck %s -check-prefix CHECK-SVE-UF0 +; RUN: opt -S --passes="default" -prefer-predicate-over-epilogue=predicate-dont-vectorize -enable-wide-lane-mask -force-vector-interleave=2 < %s | FileCheck %s -check-prefix CHECK-SVE-UF2 +; RUN: opt -S --passes="default" -prefer-predicate-over-epilogue=predicate-dont-vectorize -enable-wide-lane-mask -force-vector-interleave=4 < %s | FileCheck %s -check-prefix CHECK-SVE-UF4 + +target triple = "aarch64-unknown-linux" + +define void @scalable_wide_active_lane_mask(ptr noalias %dst, ptr readonly %src, i64 %n) #0 { +; CHECK-SVE-UF0-LABEL: define void @scalable_wide_active_lane_mask( +; CHECK-SVE-UF0-SAME: ptr noalias writeonly captures(none) [[DST:%.*]], ptr readonly captures(none) [[SRC:%.*]], i64 [[N:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-SVE-UF0-NEXT: entry: +; CHECK-SVE-UF0-NEXT: [[CMP7:%.*]] = icmp sgt i64 [[N]], 0 +; CHECK-SVE-UF0-NEXT: br i1 [[CMP7]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK-SVE-UF0: vector.ph: +; CHECK-SVE-UF0-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF0-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4 +; CHECK-SVE-UF0-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF0-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 4 +; CHECK-SVE-UF0-NEXT: [[TMP4:%.*]] = tail call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP3]]) +; CHECK-SVE-UF0-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[N]]) +; CHECK-SVE-UF0-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-SVE-UF0: vector.body: +; CHECK-SVE-UF0-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF0-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF0-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[SRC]], i64 [[INDEX]] +; CHECK-SVE-UF0-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call @llvm.masked.load.nxv16i8.p0(ptr [[TMP5]], i32 1, [[ACTIVE_LANE_MASK]], poison) +; CHECK-SVE-UF0-NEXT: [[TMP6:%.*]] = mul [[WIDE_MASKED_LOAD]], splat (i8 3) +; CHECK-SVE-UF0-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[DST]], i64 [[INDEX]] +; CHECK-SVE-UF0-NEXT: tail call void @llvm.masked.store.nxv16i8.p0( [[TMP6]], ptr [[TMP7]], i32 1, [[ACTIVE_LANE_MASK]]) +; CHECK-SVE-UF0-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] +; CHECK-SVE-UF0-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP4]]) +; CHECK-SVE-UF0-NEXT: [[TMP8:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; CHECK-SVE-UF0-NEXT: br i1 [[TMP8]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-SVE-UF0: for.end: +; CHECK-SVE-UF0-NEXT: ret void +; +; CHECK-SVE-UF2-LABEL: define void @scalable_wide_active_lane_mask( +; CHECK-SVE-UF2-SAME: ptr noalias writeonly captures(none) [[DST:%.*]], ptr readonly 
captures(none) [[SRC:%.*]], i64 [[N:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-SVE-UF2-NEXT: entry: +; CHECK-SVE-UF2-NEXT: [[CMP7:%.*]] = icmp sgt i64 [[N]], 0 +; CHECK-SVE-UF2-NEXT: br i1 [[CMP7]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK-SVE-UF2: vector.ph: +; CHECK-SVE-UF2-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF2-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 5 +; CHECK-SVE-UF2-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF2-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 5 +; CHECK-SVE-UF2-NEXT: [[TMP4:%.*]] = tail call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP3]]) +; CHECK-SVE-UF2-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call @llvm.get.active.lane.mask.nxv32i1.i64(i64 0, i64 [[N]]) +; CHECK-SVE-UF2-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i1.nxv32i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 0) +; CHECK-SVE-UF2-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i1.nxv32i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 16) +; CHECK-SVE-UF2-NEXT: [[TMP7:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF2-NEXT: [[TMP8:%.*]] = shl nuw nsw i64 [[TMP7]], 4 +; CHECK-SVE-UF2-NEXT: [[TMP9:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF2-NEXT: [[TMP10:%.*]] = shl nuw nsw i64 [[TMP9]], 4 +; CHECK-SVE-UF2-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-SVE-UF2: vector.body: +; CHECK-SVE-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF2-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[TMP5]], [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF2-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[TMP6]], [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF2-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[SRC]], i64 [[INDEX]] +; CHECK-SVE-UF2-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP11]], i64 [[TMP8]] +; CHECK-SVE-UF2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call @llvm.masked.load.nxv16i8.p0(ptr [[TMP11]], i32 1, [[ACTIVE_LANE_MASK]], poison) +; CHECK-SVE-UF2-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = tail call @llvm.masked.load.nxv16i8.p0(ptr nonnull [[TMP12]], i32 1, [[ACTIVE_LANE_MASK2]], poison) +; CHECK-SVE-UF2-NEXT: [[TMP13:%.*]] = mul [[WIDE_MASKED_LOAD]], splat (i8 3) +; CHECK-SVE-UF2-NEXT: [[TMP14:%.*]] = mul [[WIDE_MASKED_LOAD3]], splat (i8 3) +; CHECK-SVE-UF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw i8, ptr [[DST]], i64 [[INDEX]] +; CHECK-SVE-UF2-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP15]], i64 [[TMP10]] +; CHECK-SVE-UF2-NEXT: tail call void @llvm.masked.store.nxv16i8.p0( [[TMP13]], ptr [[TMP15]], i32 1, [[ACTIVE_LANE_MASK]]) +; CHECK-SVE-UF2-NEXT: tail call void @llvm.masked.store.nxv16i8.p0( [[TMP14]], ptr [[TMP16]], i32 1, [[ACTIVE_LANE_MASK2]]) +; CHECK-SVE-UF2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] +; CHECK-SVE-UF2-NEXT: [[ACTIVE_LANE_MASK_NEXT:%.*]] = tail call @llvm.get.active.lane.mask.nxv32i1.i64(i64 [[INDEX]], i64 [[TMP4]]) +; CHECK-SVE-UF2-NEXT: [[TMP17]] = tail call @llvm.vector.extract.nxv16i1.nxv32i1( [[ACTIVE_LANE_MASK_NEXT]], i64 0) +; CHECK-SVE-UF2-NEXT: [[TMP18]] = tail call @llvm.vector.extract.nxv16i1.nxv32i1( [[ACTIVE_LANE_MASK_NEXT]], i64 16) +; CHECK-SVE-UF2-NEXT: [[TMP19:%.*]] = extractelement [[TMP17]], i64 0 +; CHECK-SVE-UF2-NEXT: br i1 [[TMP19]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-SVE-UF2: for.end: +; CHECK-SVE-UF2-NEXT: ret void +; +; CHECK-SVE-UF4-LABEL: define void 
@scalable_wide_active_lane_mask( +; CHECK-SVE-UF4-SAME: ptr noalias writeonly captures(none) [[DST:%.*]], ptr readonly captures(none) [[SRC:%.*]], i64 [[N:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-SVE-UF4-NEXT: entry: +; CHECK-SVE-UF4-NEXT: [[CMP7:%.*]] = icmp sgt i64 [[N]], 0 +; CHECK-SVE-UF4-NEXT: br i1 [[CMP7]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK-SVE-UF4: vector.ph: +; CHECK-SVE-UF4-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 6 +; CHECK-SVE-UF4-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 6 +; CHECK-SVE-UF4-NEXT: [[TMP4:%.*]] = tail call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP3]]) +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call @llvm.get.active.lane.mask.nxv64i1.i64(i64 0, i64 [[N]]) +; CHECK-SVE-UF4-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i1.nxv64i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 0) +; CHECK-SVE-UF4-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i1.nxv64i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 16) +; CHECK-SVE-UF4-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i1.nxv64i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 32) +; CHECK-SVE-UF4-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.extract.nxv16i1.nxv64i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 48) +; CHECK-SVE-UF4-NEXT: [[TMP9:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP10:%.*]] = shl nuw nsw i64 [[TMP9]], 4 +; CHECK-SVE-UF4-NEXT: [[TMP11:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 5 +; CHECK-SVE-UF4-NEXT: [[TMP13:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP14:%.*]] = mul nuw nsw i64 [[TMP13]], 48 +; CHECK-SVE-UF4-NEXT: [[TMP15:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP16:%.*]] = shl nuw nsw i64 [[TMP15]], 4 +; CHECK-SVE-UF4-NEXT: [[TMP17:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP18:%.*]] = shl nuw nsw i64 [[TMP17]], 5 +; CHECK-SVE-UF4-NEXT: [[TMP19:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP20:%.*]] = mul nuw nsw i64 [[TMP19]], 48 +; CHECK-SVE-UF4-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-SVE-UF4: vector.body: +; CHECK-SVE-UF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[TMP5]], [[VECTOR_PH]] ], [ [[TMP33:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK6:%.*]] = phi [ [[TMP6]], [[VECTOR_PH]] ], [ [[TMP34:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK7:%.*]] = phi [ [[TMP7]], [[VECTOR_PH]] ], [ [[TMP35:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi [ [[TMP8]], [[VECTOR_PH]] ], [ [[TMP36:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF4-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw i8, ptr [[SRC]], i64 [[INDEX]] +; CHECK-SVE-UF4-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP21]], i64 [[TMP10]] +; CHECK-SVE-UF4-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP21]], i64 [[TMP12]] +; CHECK-SVE-UF4-NEXT: [[TMP24:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP21]], i64 [[TMP14]] +; CHECK-SVE-UF4-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call @llvm.masked.load.nxv16i8.p0(ptr [[TMP21]], i32 1, [[ACTIVE_LANE_MASK]], poison) +; CHECK-SVE-UF4-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = tail call @llvm.masked.load.nxv16i8.p0(ptr nonnull [[TMP22]], i32 
1, [[ACTIVE_LANE_MASK6]], poison) +; CHECK-SVE-UF4-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = tail call @llvm.masked.load.nxv16i8.p0(ptr nonnull [[TMP23]], i32 1, [[ACTIVE_LANE_MASK7]], poison) +; CHECK-SVE-UF4-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = tail call @llvm.masked.load.nxv16i8.p0(ptr nonnull [[TMP24]], i32 1, [[ACTIVE_LANE_MASK8]], poison) +; CHECK-SVE-UF4-NEXT: [[TMP25:%.*]] = mul [[WIDE_MASKED_LOAD]], splat (i8 3) +; CHECK-SVE-UF4-NEXT: [[TMP26:%.*]] = mul [[WIDE_MASKED_LOAD9]], splat (i8 3) +; CHECK-SVE-UF4-NEXT: [[TMP27:%.*]] = mul [[WIDE_MASKED_LOAD10]], splat (i8 3) +; CHECK-SVE-UF4-NEXT: [[TMP28:%.*]] = mul [[WIDE_MASKED_LOAD11]], splat (i8 3) +; CHECK-SVE-UF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds nuw i8, ptr [[DST]], i64 [[INDEX]] +; CHECK-SVE-UF4-NEXT: [[TMP30:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP29]], i64 [[TMP16]] +; CHECK-SVE-UF4-NEXT: [[TMP31:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP29]], i64 [[TMP18]] +; CHECK-SVE-UF4-NEXT: [[TMP32:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP29]], i64 [[TMP20]] +; CHECK-SVE-UF4-NEXT: tail call void @llvm.masked.store.nxv16i8.p0( [[TMP25]], ptr [[TMP29]], i32 1, [[ACTIVE_LANE_MASK]]) +; CHECK-SVE-UF4-NEXT: tail call void @llvm.masked.store.nxv16i8.p0( [[TMP26]], ptr [[TMP30]], i32 1, [[ACTIVE_LANE_MASK6]]) +; CHECK-SVE-UF4-NEXT: tail call void @llvm.masked.store.nxv16i8.p0( [[TMP27]], ptr [[TMP31]], i32 1, [[ACTIVE_LANE_MASK7]]) +; CHECK-SVE-UF4-NEXT: tail call void @llvm.masked.store.nxv16i8.p0( [[TMP28]], ptr [[TMP32]], i32 1, [[ACTIVE_LANE_MASK8]]) +; CHECK-SVE-UF4-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK_NEXT:%.*]] = tail call @llvm.get.active.lane.mask.nxv64i1.i64(i64 [[INDEX]], i64 [[TMP4]]) +; CHECK-SVE-UF4-NEXT: [[TMP33]] = tail call @llvm.vector.extract.nxv16i1.nxv64i1( [[ACTIVE_LANE_MASK_NEXT]], i64 0) +; CHECK-SVE-UF4-NEXT: [[TMP34]] = tail call @llvm.vector.extract.nxv16i1.nxv64i1( [[ACTIVE_LANE_MASK_NEXT]], i64 16) +; CHECK-SVE-UF4-NEXT: [[TMP35]] = tail call @llvm.vector.extract.nxv16i1.nxv64i1( [[ACTIVE_LANE_MASK_NEXT]], i64 32) +; CHECK-SVE-UF4-NEXT: [[TMP36]] = tail call @llvm.vector.extract.nxv16i1.nxv64i1( [[ACTIVE_LANE_MASK_NEXT]], i64 48) +; CHECK-SVE-UF4-NEXT: [[TMP37:%.*]] = extractelement [[TMP33]], i64 0 +; CHECK-SVE-UF4-NEXT: br i1 [[TMP37]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-SVE-UF4: for.end: +; CHECK-SVE-UF4-NEXT: ret void +; +entry: + %cmp = icmp sgt i64 %n, 0 + br i1 %cmp, label %for.body, label %for.end + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx1 = getelementptr inbounds i8, ptr %src, i64 %iv + %ld = load i8, ptr %arrayidx1 + %mul = mul i8 %ld, 3 + %arrayidx2 = getelementptr inbounds i8, ptr %dst, i64 %iv + store i8 %mul, ptr %arrayidx2 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: + ret void +} + +define void @scalable_wide_active_lane_mask_float(ptr noalias %dst, ptr readonly %src, i32 %n) #0 { +; CHECK-SVE-UF0-LABEL: define void @scalable_wide_active_lane_mask_float( +; CHECK-SVE-UF0-SAME: ptr noalias writeonly captures(none) [[DST:%.*]], ptr readonly captures(none) [[SRC:%.*]], i32 [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CHECK-SVE-UF0-NEXT: entry: +; CHECK-SVE-UF0-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-SVE-UF0-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK-SVE-UF0: 
for.body.preheader: +; CHECK-SVE-UF0-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64 +; CHECK-SVE-UF0-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF0-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 1 +; CHECK-SVE-UF0-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE-UF0-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-SVE-UF0: vector.body: +; CHECK-SVE-UF0-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF0-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF0-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw double, ptr [[SRC]], i64 [[INDEX]] +; CHECK-SVE-UF0-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call @llvm.masked.load.nxv2f64.p0(ptr [[TMP2]], i32 8, [[ACTIVE_LANE_MASK]], poison) +; CHECK-SVE-UF0-NEXT: [[TMP3:%.*]] = fmul [[WIDE_MASKED_LOAD]], splat (double 3.000000e+00) +; CHECK-SVE-UF0-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw double, ptr [[DST]], i64 [[INDEX]] +; CHECK-SVE-UF0-NEXT: tail call void @llvm.masked.store.nxv2f64.p0( [[TMP3]], ptr [[TMP4]], i32 8, [[ACTIVE_LANE_MASK]]) +; CHECK-SVE-UF0-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] +; CHECK-SVE-UF0-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = tail call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE-UF0-NEXT: [[TMP5:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; CHECK-SVE-UF0-NEXT: br i1 [[TMP5]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-SVE-UF0: for.end: +; CHECK-SVE-UF0-NEXT: ret void +; +; CHECK-SVE-UF2-LABEL: define void @scalable_wide_active_lane_mask_float( +; CHECK-SVE-UF2-SAME: ptr noalias writeonly captures(none) [[DST:%.*]], ptr readonly captures(none) [[SRC:%.*]], i32 [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CHECK-SVE-UF2-NEXT: entry: +; CHECK-SVE-UF2-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-SVE-UF2-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK-SVE-UF2: for.body.preheader: +; CHECK-SVE-UF2-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64 +; CHECK-SVE-UF2-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF2-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-SVE-UF2-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE-UF2-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i1.nxv4i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 0) +; CHECK-SVE-UF2-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i1.nxv4i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 2) +; CHECK-SVE-UF2-NEXT: [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF2-NEXT: [[DOTIDX:%.*]] = shl nuw nsw i64 [[TMP4]], 4 +; CHECK-SVE-UF2-NEXT: [[TMP5:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF2-NEXT: [[DOTIDX5:%.*]] = shl nuw nsw i64 [[TMP5]], 4 +; CHECK-SVE-UF2-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-SVE-UF2: vector.body: +; CHECK-SVE-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF2-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[TMP2]], [[FOR_BODY_PREHEADER]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF2-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[TMP3]], [[FOR_BODY_PREHEADER]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] 
+; CHECK-SVE-UF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw double, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-SVE-UF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], i64 [[DOTIDX]]
+; CHECK-SVE-UF2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP6]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> poison)
+; CHECK-SVE-UF2-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = tail call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr nonnull [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 2 x double> poison)
+; CHECK-SVE-UF2-NEXT: [[TMP8:%.*]] = fmul <vscale x 2 x double> [[WIDE_MASKED_LOAD]], splat (double 3.000000e+00)
+; CHECK-SVE-UF2-NEXT: [[TMP9:%.*]] = fmul <vscale x 2 x double> [[WIDE_MASKED_LOAD3]], splat (double 3.000000e+00)
+; CHECK-SVE-UF2-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw double, ptr [[DST]], i64 [[INDEX]]
+; CHECK-SVE-UF2-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP10]], i64 [[DOTIDX5]]
+; CHECK-SVE-UF2-NEXT: tail call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP8]], ptr [[TMP10]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-SVE-UF2-NEXT: tail call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP9]], ptr [[TMP11]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]])
+; CHECK-SVE-UF2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
+; CHECK-SVE-UF2-NEXT: [[ACTIVE_LANE_MASK_NEXT:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-SVE-UF2-NEXT: [[TMP12]] = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0)
+; CHECK-SVE-UF2-NEXT: [[TMP13]] = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 2)
+; CHECK-SVE-UF2-NEXT: [[TMP14:%.*]] = extractelement <vscale x 2 x i1> [[TMP12]], i64 0
+; CHECK-SVE-UF2-NEXT: br i1 [[TMP14]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-SVE-UF2: for.end:
+; CHECK-SVE-UF2-NEXT: ret void
+;
+; CHECK-SVE-UF4-LABEL: define void @scalable_wide_active_lane_mask_float(
+; CHECK-SVE-UF4-SAME: ptr noalias writeonly captures(none) [[DST:%.*]], ptr readonly captures(none) [[SRC:%.*]], i32 [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-SVE-UF4-NEXT: entry:
+; CHECK-SVE-UF4-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-SVE-UF4-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK-SVE-UF4: for.body.preheader:
+; CHECK-SVE-UF4-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
+; CHECK-SVE-UF4-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF4-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 3
+; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+; CHECK-SVE-UF4-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv8i1(<vscale x 8 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 0)
+; CHECK-SVE-UF4-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv8i1(<vscale x 8 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 2)
+; CHECK-SVE-UF4-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv8i1(<vscale x 8 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 4)
+; CHECK-SVE-UF4-NEXT: [[TMP5:%.*]] = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv8i1(<vscale x 8 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 6)
+; CHECK-SVE-UF4-NEXT: [[TMP6:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF4-NEXT: [[DOTIDX:%.*]] = shl nuw nsw i64 [[TMP6]], 4
+; CHECK-SVE-UF4-NEXT: [[TMP7:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF4-NEXT: [[DOTIDX15:%.*]] = shl nuw nsw i64 [[TMP7]], 5
+; CHECK-SVE-UF4-NEXT: [[TMP8:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF4-NEXT: [[DOTIDX16:%.*]] = mul nuw nsw i64 [[TMP8]], 48
+; CHECK-SVE-UF4-NEXT: [[TMP9:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF4-NEXT: [[DOTIDX17:%.*]] = shl nuw nsw i64 [[TMP9]], 4
+; CHECK-SVE-UF4-NEXT: [[TMP10:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF4-NEXT: [[DOTIDX18:%.*]] = shl nuw nsw i64 [[TMP10]], 5
+; CHECK-SVE-UF4-NEXT: [[TMP11:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF4-NEXT: [[DOTIDX19:%.*]] = mul nuw nsw i64 [[TMP11]], 48
+; CHECK-SVE-UF4-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-SVE-UF4: vector.body:
+; CHECK-SVE-UF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[TMP2]], [[FOR_BODY_PREHEADER]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK6:%.*]] = phi <vscale x 2 x i1> [ [[TMP3]], [[FOR_BODY_PREHEADER]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK7:%.*]] = phi <vscale x 2 x i1> [ [[TMP4]], [[FOR_BODY_PREHEADER]] ], [ [[TMP26:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi <vscale x 2 x i1> [ [[TMP5]], [[FOR_BODY_PREHEADER]] ], [ [[TMP27:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-UF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw double, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-SVE-UF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP12]], i64 [[DOTIDX]]
+; CHECK-SVE-UF4-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP12]], i64 [[DOTIDX15]]
+; CHECK-SVE-UF4-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP12]], i64 [[DOTIDX16]]
+; CHECK-SVE-UF4-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP12]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> poison)
+; CHECK-SVE-UF4-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = tail call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr nonnull [[TMP13]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK6]], <vscale x 2 x double> poison)
+; CHECK-SVE-UF4-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = tail call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr nonnull [[TMP14]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 2 x double> poison)
+; CHECK-SVE-UF4-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = tail call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr nonnull [[TMP15]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 2 x double> poison)
+; CHECK-SVE-UF4-NEXT: [[TMP16:%.*]] = fmul <vscale x 2 x double> [[WIDE_MASKED_LOAD]], splat (double 3.000000e+00)
+; CHECK-SVE-UF4-NEXT: [[TMP17:%.*]] = fmul <vscale x 2 x double> [[WIDE_MASKED_LOAD9]], splat (double 3.000000e+00)
+; CHECK-SVE-UF4-NEXT: [[TMP18:%.*]] = fmul <vscale x 2 x double> [[WIDE_MASKED_LOAD10]], splat (double 3.000000e+00)
+; CHECK-SVE-UF4-NEXT: [[TMP19:%.*]] = fmul <vscale x 2 x double> [[WIDE_MASKED_LOAD11]], splat (double 3.000000e+00)
+; CHECK-SVE-UF4-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw double, ptr [[DST]], i64 [[INDEX]]
+; CHECK-SVE-UF4-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP20]], i64 [[DOTIDX17]]
+; CHECK-SVE-UF4-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP20]], i64 [[DOTIDX18]]
+; CHECK-SVE-UF4-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP20]], i64 [[DOTIDX19]]
+; CHECK-SVE-UF4-NEXT: tail call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP16]], ptr [[TMP20]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-SVE-UF4-NEXT: tail call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP17]], ptr [[TMP21]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK6]])
+; CHECK-SVE-UF4-NEXT: tail call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP18]], ptr [[TMP22]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK7]])
+; CHECK-SVE-UF4-NEXT: tail call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP19]], ptr [[TMP23]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK8]])
+; CHECK-SVE-UF4-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
+; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK_NEXT:%.*]] = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-SVE-UF4-NEXT: [[TMP24]] = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv8i1(<vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0)
+; CHECK-SVE-UF4-NEXT: [[TMP25]] = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv8i1(<vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 2)
+; CHECK-SVE-UF4-NEXT: [[TMP26]] = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv8i1(<vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 4)
+; CHECK-SVE-UF4-NEXT: [[TMP27]] = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv8i1(<vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 6)
+; CHECK-SVE-UF4-NEXT: [[TMP28:%.*]] = extractelement <vscale x 2 x i1> [[TMP24]], i64 0
+; CHECK-SVE-UF4-NEXT: br i1 [[TMP28]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-SVE-UF4: for.end:
+; CHECK-SVE-UF4-NEXT: ret void
+;
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body.preheader, label %for.end
+
+for.body.preheader:
+  %wide.trip.count = zext nneg i32 %n to i64
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ]
+  %arrayidx1 = getelementptr inbounds double, ptr %src, i64 %iv
+  %ld = load double, ptr %arrayidx1
+  %mul = fmul double %ld, 3.000000e+00
+  %arrayidx2 = getelementptr inbounds double, ptr %dst, i64 %iv
+  store double %mul, ptr %arrayidx2
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+attributes #0 = { nounwind vscale_range(1,16) "target-features"="+sve2p1" }
+
+;.
+; CHECK-SVE-UF0: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK-SVE-UF0: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK-SVE-UF0: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK-SVE-UF0: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
+;.
+; CHECK-SVE-UF2: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK-SVE-UF2: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK-SVE-UF2: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK-SVE-UF2: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
+;.
+; CHECK-SVE-UF4: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK-SVE-UF4: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK-SVE-UF4: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK-SVE-UF4: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/active-lane-mask.ll b/llvm/test/Transforms/LoopVectorize/ARM/active-lane-mask.ll
new file mode 100644
index 0000000000000..d20216654c3b4
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/ARM/active-lane-mask.ll
@@ -0,0 +1,88 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=loop-vectorize -tail-predication=enabled -prefer-predicate-over-epilogue=predicate-dont-vectorize -enable-wide-lane-mask -S | FileCheck %s
+
+target triple = "thumbv8.1m.main-arm-unknown-eabihf"
+
+define void @f0(ptr noalias %dst, ptr readonly %src, i64 %n) #0 {
+; CHECK-LABEL: define void @f0(
+; CHECK-SAME: ptr noalias [[DST:%.*]], ptr readonly [[SRC:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[VAL:%.*]] = icmp sgt i64 [[N]], 0
+; CHECK-NEXT: br i1 [[VAL]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_END:.*]]
+; CHECK: [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 31
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 32
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 16
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 [[INDEX]], i64 [[N]])
+; CHECK-NEXT: [[ACTIVE_LANE_MASK1:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 [[TMP0]], i64 [[N]])
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 16
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP2]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP3]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK1]], <16 x i8> poison)
+; CHECK-NEXT: [[TMP4:%.*]] = mul <16 x i8> [[WIDE_MASKED_LOAD]], splat (i8 3)
+; CHECK-NEXT: [[TMP5:%.*]] = mul <16 x i8> [[WIDE_MASKED_LOAD2]], splat (i8 3)
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 16
+; CHECK-NEXT: call void @llvm.masked.store.v16i8.p0(<16 x i8> [[TMP4]], ptr [[TMP7]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: call void @llvm.masked.store.v16i8.p0(<16 x i8> [[TMP5]], ptr [[TMP8]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK1]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br label %[[FOR_END_LOOPEXIT:.*]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[MUL:%.*]] = mul i8 [[TMP10]], 3
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: store i8 [[MUL]], ptr [[ARRAYIDX3]], align 1
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: [[FOR_END_LOOPEXIT]]:
+; CHECK-NEXT: br label %[[FOR_END]]
+; CHECK: [[FOR_END]]:
+; CHECK-NEXT: ret void
+;
+entry:
+  %val = icmp sgt i64 %n, 0
+  br i1 %val, label %for.body, label %for.end
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i8, ptr %src, i64 %indvars.iv
+  %0 = load i8, ptr %arrayidx, align 1
+  %mul = mul i8 %0, 3
+  %arrayidx3 = getelementptr inbounds i8, ptr %dst, i64 %indvars.iv
+  store i8 %mul, ptr %arrayidx3, align 1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !1
+
+for.end:
+  ret void
+}
+
+attributes #0 = { nofree norecurse nounwind "target-features"="+armv8.1-m.main,+mve.fp" }
+
+!1 = distinct !{!1, !2, !3}
+!2 = !{!"llvm.loop.vectorize.width", i32 16}
+!3 = !{!"llvm.loop.interleave.count", i32 2}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.
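
For reference, the pattern the wide-lane-mask checks above verify reduces to the following self-contained IR sketch: one wide @llvm.get.active.lane.mask call covers all unrolled parts, and each part's predicate is split out with @llvm.vector.extract. This sketch is not part of the patch; the function name @wide_mask_sketch, its parameters, and the trailing and/ret exist only to make the example verifiable on its own, and the intrinsic calls mirror those that appear in the SVE UF2 checks above (VF = vscale x 2, UF = 2).

; A minimal sketch of the split-wide-mask pattern; illustrative only.
declare <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64, i64)
declare <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1>, i64)

define <vscale x 2 x i1> @wide_mask_sketch(i64 %index, i64 %trip.count) {
entry:
  ; One wide mask spanning both unrolled parts (2 x VF lanes in total).
  %wide.mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %index, i64 %trip.count)
  ; Part 0 takes lanes [0, 2*vscale); part 1 takes lanes [2*vscale, 4*vscale).
  %mask.part0 = call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> %wide.mask, i64 0)
  %mask.part1 = call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> %wide.mask, i64 2)
  ; In the vectorized loop these per-part masks predicate the masked loads and
  ; stores; here they are simply combined and returned to keep the sketch
  ; self-contained.
  %combined = and <vscale x 2 x i1> %mask.part0, %mask.part1
  ret <vscale x 2 x i1> %combined
}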