From 7bba233ca9cb8cd4dc333e85dee115fb06137d40 Mon Sep 17 00:00:00 2001
From: Kerry McLaughlin
Date: Thu, 20 Mar 2025 16:08:26 +0000
Subject: [PATCH] [LoopVectorize] Generate wide active lane masks

This patch adds a new flag (-enable-wide-lane-mask) which allows
LoopVectorize to generate wider-than-VF active lane masks when it is
safe to do so (i.e. when tail folding is used without runtime checks).

The transform in extractFromWideActiveLaneMask creates vector extracts
from the first active lane mask in both the header and the loop body,
modifying the active lane mask phi operands to use the extracts.
An additional operand is passed to the ActiveLaneMask instruction, the
value of which is used as a multiplier of VF when generating the mask.
By default this is 1, and it is updated to UF by
extractFromWideActiveLaneMask.

The motivation for this change is to improve interleaved loops when
SVE2.1 is available, where we can make use of the whilelo instruction,
which returns a predicate pair.

This is based on a PR created by @momchil-velikov (#81140) and
includes tests which were added there.
---
 llvm/lib/Analysis/VectorUtils.cpp             |   2 +
 .../Vectorize/LoopVectorizationPlanner.h      |   1 +
 .../Transforms/Vectorize/LoopVectorize.cpp    |   9 +-
 llvm/lib/Transforms/Vectorize/VPlan.h         |  17 +-
 .../Transforms/Vectorize/VPlanAnalysis.cpp    |   1 +
 .../Transforms/Vectorize/VPlanPatternMatch.h  |   9 +-
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  19 +-
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 119 +++++--
 .../Transforms/Vectorize/VPlanTransforms.h    |   3 +-
 llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp |   1 +
 llvm/lib/Transforms/Vectorize/VPlanUtils.cpp  |   2 +-
 .../CodeGen/AArch64/sve-wide-lane-mask.ll     | 121 +++++++
 .../AArch64/fixed-wide-lane-mask.ll           | 129 +++++++
 .../AArch64/sve-wide-lane-mask.ll             | 335 ++++++++++++++++++
 .../LoopVectorize/ARM/active-lane-mask.ll     |  88 +++++
 15 files changed, 824 insertions(+), 32 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/sve-wide-lane-mask.ll
 create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/fixed-wide-lane-mask.ll
 create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/sve-wide-lane-mask.ll
 create mode 100644 llvm/test/Transforms/LoopVectorize/ARM/active-lane-mask.ll

diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 63fccee63c0ae..1dff9c3513a28 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -163,6 +163,7 @@ bool llvm::isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID,
   case Intrinsic::is_fpclass:
   case Intrinsic::vp_is_fpclass:
   case Intrinsic::powi:
+  case Intrinsic::vector_extract:
     return (ScalarOpdIdx == 1);
   case Intrinsic::smul_fix:
   case Intrinsic::smul_fix_sat:
@@ -195,6 +196,7 @@ bool llvm::isVectorIntrinsicWithOverloadTypeAtArg(
   case Intrinsic::vp_llrint:
   case Intrinsic::ucmp:
   case Intrinsic::scmp:
+  case Intrinsic::vector_extract:
     return OpdIdx == -1 || OpdIdx == 0;
   case Intrinsic::modf:
   case Intrinsic::sincos:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 144f35e10132f..dd54d964f8883 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -44,6 +44,7 @@ class VPRecipeBuilder;
 struct VFRange;
 
 extern cl::opt<bool> EnableVPlanNativePath;
+extern cl::opt<bool> EnableWideActiveLaneMask;
 extern cl::opt<unsigned> ForceTargetInstructionCost;
 
 /// VPlan-based builder utility analogous to IRBuilder.
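For reference, a sketch of what the transform produces (distilled from the
sve-wide-lane-mask.ll tests added below; the value names here are
illustrative, not taken from the patch). At VF=vscale x 16 with UF=2, the
loop body computes one wide mask and splits it, instead of emitting two
independent get.active.lane.mask calls per part:

;   %wide.mask  = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 %index, i64 %tc)
;   %mask.part0 = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> %wide.mask, i64 0)
;   %mask.part1 = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> %wide.mask, i64 16)

With SVE2.1, the wide mask plus the two extracts can lower to a single
whilelo producing a predicate pair, as checked by the CHECK-SVE2p1 lines in
the CodeGen test below.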
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e7bae17dd2ceb..6e5f4caf93d23 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -356,6 +356,10 @@ cl::opt<bool> llvm::EnableVPlanNativePath(
     cl::desc("Enable VPlan-native vectorization path with "
              "support for outer loop vectorization."));
 
+cl::opt<bool> llvm::EnableWideActiveLaneMask(
+    "enable-wide-lane-mask", cl::init(false), cl::Hidden,
+    cl::desc("Enable use of wide get active lane mask instructions"));
+
 cl::opt<bool>
     llvm::VerifyEachVPlan("vplan-verify-each",
 #ifdef EXPENSIVE_CHECKS
@@ -7328,7 +7332,10 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
     VPlanTransforms::runPass(VPlanTransforms::addBranchWeightToMiddleTerminator,
                              BestVPlan, BestVF, VScale);
   }
-  VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
+  VPlanTransforms::optimizeForVFAndUF(
+      BestVPlan, BestVF, BestUF, PSE,
+      ILV.Cost->getTailFoldingStyle() ==
+          TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck);
   VPlanTransforms::simplifyRecipes(BestVPlan, *Legal->getWidestInductionType());
   VPlanTransforms::narrowInterleaveGroups(
       BestVPlan, BestVF,
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 356af4a0e74e4..6080aa88ec306 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -954,6 +954,9 @@ class VPInstruction : public VPRecipeWithIRFlags,
   // part if it is scalar. In the latter case, the recipe will be removed
   // during unrolling.
   ExtractPenultimateElement,
+  // Extracts a subvector from a vector (first operand) starting at a given
+  // offset (second operand).
+  ExtractSubvector,
   LogicalAnd, // Non-poison propagating logical And.
   // Add an offset in bytes (second operand) to a base pointer (first
   // operand). Only generates scalar values (either for the first lane only or
@@ -1887,6 +1890,9 @@ class VPHeaderPHIRecipe : public VPSingleDefRecipe, public VPPhiAccessors {
     return getOperand(1);
   }
 
+  // Update the incoming value from the loop backedge.
+  void setBackedgeValue(VPValue *V) { setOperand(1, V); }
+
   /// Returns the backedge value as a recipe. The backedge value is guaranteed
   /// to be a recipe.
   virtual VPRecipeBase &getBackedgeRecipe() {
@@ -3234,10 +3240,12 @@ class VPCanonicalIVPHIRecipe : public VPHeaderPHIRecipe {
 /// TODO: It would be good to use the existing VPWidenPHIRecipe instead and
 /// remove VPActiveLaneMaskPHIRecipe.
 class VPActiveLaneMaskPHIRecipe : public VPHeaderPHIRecipe {
+  unsigned UnrollPart = 0;
+
 public:
-  VPActiveLaneMaskPHIRecipe(VPValue *StartMask, DebugLoc DL)
-      : VPHeaderPHIRecipe(VPDef::VPActiveLaneMaskPHISC, nullptr, StartMask,
-                          DL) {}
+  VPActiveLaneMaskPHIRecipe(VPValue *StartMask, DebugLoc DL, unsigned Part = 0)
+      : VPHeaderPHIRecipe(VPDef::VPActiveLaneMaskPHISC, nullptr, StartMask, DL),
+        UnrollPart(Part) {}
 
   ~VPActiveLaneMaskPHIRecipe() override = default;
 
@@ -3250,6 +3258,9 @@ class VPActiveLaneMaskPHIRecipe : public VPHeaderPHIRecipe {
 
   VP_CLASSOF_IMPL(VPDef::VPActiveLaneMaskPHISC)
 
+  unsigned getUnrollPart() { return UnrollPart; }
+  void setUnrollPart(unsigned Part) { UnrollPart = Part; }
+
   /// Generate the active lane mask phi of the vector loop.
   void execute(VPTransformState &State) override;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 92db9674ef42b..5e7f797b70978 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -74,6 +74,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
   switch (Opcode) {
   case Instruction::ExtractElement:
   case Instruction::Freeze:
+  case VPInstruction::ExtractSubvector:
   case VPInstruction::ReductionStartVector:
     return inferScalarType(R->getOperand(0));
   case Instruction::Select: {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index efea99f22d086..62898bf2c1991 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -384,10 +384,11 @@ m_Broadcast(const Op0_t &Op0) {
   return m_VPInstruction<VPInstruction::Broadcast>(Op0);
 }
 
-template <typename Op0_t, typename Op1_t>
-inline BinaryVPInstruction_match<Op0_t, Op1_t, VPInstruction::ActiveLaneMask>
-m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1) {
-  return m_VPInstruction<VPInstruction::ActiveLaneMask>(Op0, Op1);
+template <typename Op0_t, typename Op1_t, typename Op2_t>
+inline TernaryVPInstruction_match<Op0_t, Op1_t, Op2_t,
+                                  VPInstruction::ActiveLaneMask>
+m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2) {
+  return m_VPInstruction<VPInstruction::ActiveLaneMask>(Op0, Op1, Op2);
 }
 
 template <typename Op0_t, typename Op1_t>
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index ccb7512051d77..c776d5cb91278 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -469,15 +469,16 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
   case Instruction::ICmp:
   case Instruction::FCmp:
   case Instruction::Store:
-  case VPInstruction::ActiveLaneMask:
   case VPInstruction::BranchOnCount:
   case VPInstruction::ComputeReductionResult:
+  case VPInstruction::ExtractSubvector:
   case VPInstruction::FirstOrderRecurrenceSplice:
   case VPInstruction::LogicalAnd:
   case VPInstruction::PtrAdd:
   case VPInstruction::WideIVStep:
     return 2;
   case Instruction::Select:
+  case VPInstruction::ActiveLaneMask:
   case VPInstruction::ComputeAnyOfResult:
   case VPInstruction::ReductionStartVector:
     return 3;
@@ -614,7 +615,9 @@ Value *VPInstruction::generate(VPTransformState &State) {
                                    Name);
 
     auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
-    auto *PredTy = VectorType::get(Int1Ty, State.VF);
+    auto PredTy = VectorType::get(
+        Int1Ty, State.VF * cast<ConstantInt>(getOperand(2)->getLiveInIRValue())
+                    ->getZExtValue());
     return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
                                    {PredTy, ScalarTC->getType()},
                                    {VIVElem0, ScalarTC}, nullptr, Name);
@@ -846,6 +849,14 @@ Value *VPInstruction::generate(VPTransformState &State) {
     Res->setName(Name);
     return Res;
   }
+  case VPInstruction::ExtractSubvector: {
+    Value *Vec = State.get(getOperand(0));
+    assert(State.VF.isVector());
+    auto Idx = cast<ConstantInt>(getOperand(1)->getLiveInIRValue());
+    auto ResTy = VectorType::get(
+        State.TypeAnalysis.inferScalarType(getOperand(0)), State.VF);
+    return Builder.CreateExtractVector(ResTy, Vec, Idx);
+  }
   case VPInstruction::LogicalAnd: {
     Value *A = State.get(getOperand(0));
     Value *B = State.get(getOperand(1));
@@ -1044,6 +1055,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
   case VPInstruction::CanonicalIVIncrementForPart:
   case VPInstruction::ExtractLastElement:
   case VPInstruction::ExtractPenultimateElement:
+  case VPInstruction::ExtractSubvector:
   case VPInstruction::FirstActiveLane:
   case VPInstruction::FirstOrderRecurrenceSplice:
   case VPInstruction::LogicalAnd:
@@ -1174,6 +1186,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
   case VPInstruction::ExtractPenultimateElement:
     O << "extract-penultimate-element";
     break;
+  case VPInstruction::ExtractSubvector:
+    O << "extract-subvector";
+    break;
   case VPInstruction::ComputeAnyOfResult:
     O << "compute-anyof-result";
     break;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 90137b72c83fb..b8f14ca88e8a3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "VPlanTransforms.h"
+#include "LoopVectorizationPlanner.h"
 #include "VPRecipeBuilder.h"
 #include "VPlan.h"
 #include "VPlanAnalysis.h"
@@ -1432,20 +1433,93 @@ static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan,
   return SE.isKnownPredicate(CmpInst::ICMP_EQ, TripCount, C);
 }
 
+static void extractFromWideActiveLaneMask(VPlan &Plan, ElementCount VF,
+                                          unsigned UF) {
+  VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
+  auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
+  VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
+  auto *Term = &ExitingVPBB->back();
+
+  VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
+  LLVMContext &Ctx = CanonicalIV->getScalarType()->getContext();
+  using namespace llvm::VPlanPatternMatch;
+
+  auto extractFromALM = [&](VPInstruction *ALM, VPInstruction *InsBefore,
+                            SmallVectorImpl<VPValue *> &Extracts) {
+    VPBuilder Builder(InsBefore);
+    DebugLoc DL = ALM->getDebugLoc();
+    for (unsigned Part = 0; Part < UF; ++Part) {
+      SmallVector<VPValue *> Ops;
+      Ops.append({ALM, Plan.getOrAddLiveIn(
+                           ConstantInt::get(IntegerType::getInt64Ty(Ctx),
+                                            VF.getKnownMinValue() * Part))});
+      Extracts.push_back(
+          Builder.createNaryOp(VPInstruction::ExtractSubvector, Ops, DL));
+    }
+  };
+
+  // Create a list of each active lane mask phi, ordered by unroll part.
+  SmallVector<VPActiveLaneMaskPHIRecipe *> Phis(UF, nullptr);
+  for (VPRecipeBase &R : Header->phis())
+    if (auto *Phi = dyn_cast<VPActiveLaneMaskPHIRecipe>(&R))
+      Phis[Phi->getUnrollPart()] = Phi;
+
+  assert(all_of(Phis, [](VPActiveLaneMaskPHIRecipe *Phi) { return Phi; }) &&
+         "Expected one VPActiveLaneMaskPHIRecipe for each unroll part");
+
+  // When using wide lane masks, the return type of the get.active.lane.mask
+  // intrinsic is VF x UF (second operand).
+  VPValue *ALMMultiplier =
+      Plan.getOrAddLiveIn(ConstantInt::get(IntegerType::getInt64Ty(Ctx), UF));
+  cast<VPInstruction>(Phis[0]->getStartValue())->setOperand(2, ALMMultiplier);
+  cast<VPInstruction>(Phis[0]->getBackedgeValue())
+      ->setOperand(2, ALMMultiplier);
+
+  // Create UF x extract vectors and insert into preheader.
+  SmallVector<VPValue *> EntryExtracts;
+  auto *EntryALM = cast<VPInstruction>(Phis[0]->getStartValue());
+  extractFromALM(EntryALM, cast<VPInstruction>(&EntryALM->getParent()->back()),
+                 EntryExtracts);
+
+  // Create UF x extract vectors and insert before the loop compare & branch,
+  // updating the compare to use the first extract.
+  SmallVector<VPValue *> LoopExtracts;
+  auto *LoopALM = cast<VPInstruction>(Phis[0]->getBackedgeValue());
+  VPInstruction *Not = cast<VPInstruction>(Term->getOperand(0));
+  extractFromALM(LoopALM, Not, LoopExtracts);
+  Not->setOperand(0, LoopExtracts[0]);
+
+  // Update the incoming values of active lane mask phis.
+  for (unsigned Part = 0; Part < UF; ++Part) {
+    Phis[Part]->setStartValue(EntryExtracts[Part]);
+    Phis[Part]->setBackedgeValue(LoopExtracts[Part]);
+  }
+
+  return;
+}
+
 /// Try to simplify the branch condition of \p Plan. This may restrict the
 /// resulting plan to \p BestVF and \p BestUF.
-static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
-                                              unsigned BestUF,
-                                              PredicatedScalarEvolution &PSE) {
+static bool
+simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
+                                  unsigned BestUF,
+                                  PredicatedScalarEvolution &PSE,
+                                  bool DataAndControlFlowWithoutRuntimeCheck) {
   VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
   VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
   auto *Term = &ExitingVPBB->back();
   VPValue *Cond;
   ScalarEvolution &SE = *PSE.getSE();
   using namespace llvm::VPlanPatternMatch;
-  if (match(Term, m_BranchOnCount(m_VPValue(), m_VPValue())) ||
-      match(Term, m_BranchOnCond(
-                      m_Not(m_ActiveLaneMask(m_VPValue(), m_VPValue()))))) {
+  auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
+  bool BranchALM = match(Term, m_BranchOnCond(m_Not(m_ActiveLaneMask(
+                                   m_VPValue(), m_VPValue(), m_VPValue()))));
+
+  if (BranchALM || match(Term, m_BranchOnCount(m_VPValue(), m_VPValue()))) {
+    if (BranchALM && DataAndControlFlowWithoutRuntimeCheck &&
+        EnableWideActiveLaneMask && BestVF.isVector() && BestUF > 1)
+      extractFromWideActiveLaneMask(Plan, BestVF, BestUF);
+
     // Try to simplify the branch condition if TC <= VF * UF when the latch
     // terminator is BranchOnCount or BranchOnCond where the input is
     // Not(ActiveLaneMask).
@@ -1470,7 +1544,6 @@ static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
   // The vector loop region only executes once. If possible, completely remove
   // the region, otherwise replace the terminator controlling the latch with
   // (BranchOnCond true).
-  auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
   auto *CanIVTy = Plan.getCanonicalIV()->getScalarType();
   if (all_of(
           Header->phis(),
@@ -1507,14 +1580,15 @@ static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
   return true;
 }
 
-void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
-                                         unsigned BestUF,
-                                         PredicatedScalarEvolution &PSE) {
+void VPlanTransforms::optimizeForVFAndUF(
+    VPlan &Plan, ElementCount BestVF, unsigned BestUF,
+    PredicatedScalarEvolution &PSE,
+    bool DataAndControlFlowWithoutRuntimeCheck) {
   assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
   assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
 
-  bool MadeChange =
-      simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
+  bool MadeChange = simplifyBranchConditionForVFAndUF(
+      Plan, BestVF, BestUF, PSE, DataAndControlFlowWithoutRuntimeCheck);
   MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
 
   if (MadeChange) {
@@ -2006,9 +2080,11 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
                                      "index.part.next");
 
   // Create the active lane mask instruction in the VPlan preheader.
-  auto *EntryALM =
-      Builder.createNaryOp(VPInstruction::ActiveLaneMask, {EntryIncrement, TC},
-                           DL, "active.lane.mask.entry");
+  VPValue *ALMMultiplier = Plan.getOrAddLiveIn(
+      ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1));
+  auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
+                                        {EntryIncrement, TC, ALMMultiplier}, DL,
+                                        "active.lane.mask.entry");
 
   // Now create the ActiveLaneMaskPhi recipe in the main loop using the
   // preheader ActiveLaneMask instruction.
@@ -2023,8 +2099,8 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
       Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart,
                                   {IncrementValue}, {false, false}, DL);
   auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
-                                   {InLoopIncrement, TripCount}, DL,
-                                   "active.lane.mask.next");
+                                   {InLoopIncrement, TripCount, ALMMultiplier},
+                                   DL, "active.lane.mask.next");
   LaneMaskPhi->addOperand(ALM);
 
   // Replace the original terminator with BranchOnCond. We have to invert the
@@ -2101,9 +2177,12 @@ void VPlanTransforms::addActiveLaneMask(
         Plan, DataAndControlFlowWithoutRuntimeCheck);
   } else {
     VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
-    LaneMask = B.createNaryOp(VPInstruction::ActiveLaneMask,
-                              {WideCanonicalIV, Plan.getTripCount()}, nullptr,
-                              "active.lane.mask");
+    VPValue *ALMMultiplier = Plan.getOrAddLiveIn(
+        ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1));
+    LaneMask =
+        B.createNaryOp(VPInstruction::ActiveLaneMask,
+                       {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
+                       nullptr, "active.lane.mask");
   }
 
   // Walk users of WideCanonicalIV and replace all compares of the form
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 8d2eded45da22..920c7aa32cc97 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -109,7 +109,8 @@ struct VPlanTransforms {
   /// resulting plan to \p BestVF and \p BestUF.
   static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
                                  unsigned BestUF,
-                                 PredicatedScalarEvolution &PSE);
+                                 PredicatedScalarEvolution &PSE,
+                                 bool DataAndControlFlowWithoutRuntimeCheck);
 
   /// Apply VPlan-to-VPlan optimizations to \p Plan, including induction recipe
   /// optimizations, dead recipe removal, replicate region optimizations and
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index 2dd43c092ff7a..76a37d5ba839b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -250,6 +250,7 @@ void UnrollState::unrollHeaderPHIByUF(VPHeaderPHIRecipe *R,
   } else {
     assert(isa<VPActiveLaneMaskPHIRecipe>(R) &&
            "unexpected header phi recipe not needing unrolled part");
+    cast<VPActiveLaneMaskPHIRecipe>(Copy)->setUnrollPart(Part);
   }
 }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index 81bd21bb904c0..9fdc199fc1dfa 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -61,7 +61,7 @@ bool vputils::isHeaderMask(const VPValue *V, VPlan &Plan) {
 
   VPValue *A, *B;
   using namespace VPlanPatternMatch;
-  if (match(V, m_ActiveLaneMask(m_VPValue(A), m_VPValue(B))))
+  if (match(V, m_ActiveLaneMask(m_VPValue(A), m_VPValue(B), m_SpecificInt(1))))
     return B == Plan.getTripCount() &&
            (match(A, m_ScalarIVSteps(m_Specific(Plan.getCanonicalIV()),
                                      m_SpecificInt(1),
diff --git a/llvm/test/CodeGen/AArch64/sve-wide-lane-mask.ll b/llvm/test/CodeGen/AArch64/sve-wide-lane-mask.ll
new file mode 100644
index 0000000000000..d59dbec491467
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-wide-lane-mask.ll
@@ -0,0 +1,121 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mattr=+sve < %s | FileCheck %s -check-prefix CHECK-SVE
+; RUN: llc -mattr=+sve2p1 < %s | FileCheck %s -check-prefix CHECK-SVE2p1
+
+target triple = "aarch64-unknown-linux"
+
+define void @scalable_wide_active_lane_mask(ptr %dst, ptr %src, i64 %n) #0 {
+; CHECK-SVE-LABEL: scalable_wide_active_lane_mask:
+; CHECK-SVE:       // %bb.0: // %entry
+; CHECK-SVE-NEXT:    cmp x2, #1
+; CHECK-SVE-NEXT:    b.lt .LBB0_3
+; CHECK-SVE-NEXT:  // %bb.1: // %vector.ph
+; CHECK-SVE-NEXT:    rdvl x8, #2
+; CHECK-SVE-NEXT:    rdvl x9, #1
+; CHECK-SVE-NEXT:    mov x11, xzr
+; CHECK-SVE-NEXT:    subs x10, x2, x8
+; CHECK-SVE-NEXT:    csel x10, xzr, x10, lo
+; CHECK-SVE-NEXT:    whilelo p1.b, xzr, x2
+; CHECK-SVE-NEXT:    whilelo p0.b, x9, x2
+; CHECK-SVE-NEXT:  .LBB0_2: // %vector.body
+; CHECK-SVE-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-SVE-NEXT:    add x12, x1, x11
+; CHECK-SVE-NEXT:    ld1b { z0.b }, p1/z, [x1, x11]
+; CHECK-SVE-NEXT:    add x13, x0, x11
+; CHECK-SVE-NEXT:    ld1b { z1.b }, p0/z, [x12, #1, mul vl]
+; CHECK-SVE-NEXT:    adds x12, x11, x9
+; CHECK-SVE-NEXT:    csinv x12, x12, xzr, lo
+; CHECK-SVE-NEXT:    mul z0.b, z0.b, #3
+; CHECK-SVE-NEXT:    mul z1.b, z1.b, #3
+; CHECK-SVE-NEXT:    st1b { z0.b }, p1, [x0, x11]
+; CHECK-SVE-NEXT:    st1b { z1.b }, p0, [x13, #1, mul vl]
+; CHECK-SVE-NEXT:    whilelo p0.b, x12, x10
+; CHECK-SVE-NEXT:    whilelo p1.b, x11, x10
+; CHECK-SVE-NEXT:    add x11, x11, x8
+; CHECK-SVE-NEXT:    b.mi .LBB0_2
+; CHECK-SVE-NEXT:  .LBB0_3: // %for.end
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-SVE2p1-LABEL: scalable_wide_active_lane_mask:
+; CHECK-SVE2p1:       // %bb.0: // %entry
+; CHECK-SVE2p1-NEXT:    cmp x2, #1
+; CHECK-SVE2p1-NEXT:    b.lt .LBB0_3
+; CHECK-SVE2p1-NEXT:  // %bb.1: // %vector.ph
+; CHECK-SVE2p1-NEXT:    rdvl x9, #2
+; CHECK-SVE2p1-NEXT:    mov x8, xzr
+; CHECK-SVE2p1-NEXT:    subs x9, x2, x9
+; CHECK-SVE2p1-NEXT:    csel x9, xzr, x9, lo
+; CHECK-SVE2p1-NEXT:    whilelo { p0.b, p1.b }, xzr, x2
+; CHECK-SVE2p1-NEXT:  .LBB0_2: // %vector.body
+; CHECK-SVE2p1-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-SVE2p1-NEXT:    add x10, x1, x8
+; CHECK-SVE2p1-NEXT:    ld1b { z0.b }, p0/z, [x1, x8]
+; CHECK-SVE2p1-NEXT:    ld1b { z1.b }, p1/z, [x10, #1, mul vl]
+; CHECK-SVE2p1-NEXT:    add x10, x0, x8
+; CHECK-SVE2p1-NEXT:    mul z0.b, z0.b, #3
+; CHECK-SVE2p1-NEXT:    mul z1.b, z1.b, #3
+; CHECK-SVE2p1-NEXT:    st1b { z0.b }, p0, [x0, x8]
+; CHECK-SVE2p1-NEXT:    st1b { z1.b }, p1, [x10, #1, mul vl]
+; CHECK-SVE2p1-NEXT:    whilelo { p0.b, p1.b }, x8, x9
+; CHECK-SVE2p1-NEXT:    incb x8, all, mul #2
+; CHECK-SVE2p1-NEXT:    mov z0.b, p0/z, #1 // =0x1
+; CHECK-SVE2p1-NEXT:    fmov w10, s0
+; CHECK-SVE2p1-NEXT:    tbnz w10, #0, .LBB0_2
+; CHECK-SVE2p1-NEXT:  .LBB0_3: // %for.end
+; CHECK-SVE2p1-NEXT:    ret
+entry:
+  %cmp = icmp sgt i64 %n, 0
+  br i1 %cmp, label %vector.ph, label %for.end
+
+vector.ph:
+  %0 = tail call i64 @llvm.vscale.i64()
+  %1 = shl nuw nsw i64 %0, 5
+  %2 = tail call i64 @llvm.vscale.i64()
+  %3 = shl nuw nsw i64 %2, 5
+  %4 = tail call i64 @llvm.usub.sat.i64(i64 %n, i64 %3)
+  %active.lane.mask.entry = tail call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 0, i64 %n)
+  %5 = tail call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> %active.lane.mask.entry, i64 0)
+  %6 = tail call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> %active.lane.mask.entry, i64 16)
+  %7 = tail call i64 @llvm.vscale.i64()
+  %8 = shl nuw nsw i64 %7, 4
+  %9 = tail call i64 @llvm.vscale.i64()
+  %10 = shl nuw nsw i64 %9, 4
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %active.lane.mask = phi <vscale x 16 x i1> [ %5, %vector.ph ], [ %17, %vector.body ]
+  %active.lane.mask2 = phi <vscale x 16 x i1> [ %6, %vector.ph ], [ %18, %vector.body ]
+  %11 = getelementptr inbounds nuw i8, ptr %src, i64 %index
+  %12 = getelementptr inbounds nuw i8, ptr %11, i64 %8
+  %wide.masked.load = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr %11, i32 1, <vscale x 16 x i1> %active.lane.mask, <vscale x 16 x i8> poison)
+  %wide.masked.load3 = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr nonnull %12, i32 1, <vscale x 16 x i1> %active.lane.mask2, <vscale x 16 x i8> poison)
+  %13 = mul <vscale x 16 x i8> %wide.masked.load, splat (i8 3)
+  %14 = mul <vscale x 16 x i8> %wide.masked.load3, splat (i8 3)
+  %15 = getelementptr inbounds nuw i8, ptr %dst, i64 %index
+  %16 = getelementptr inbounds nuw i8, ptr %15, i64 %10
+  tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %13, ptr %15, i32 1, <vscale x 16 x i1> %active.lane.mask)
+  tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %14, ptr %16, i32 1, <vscale x 16 x i1> %active.lane.mask2)
+  %index.next = add i64 %index, %1
+  %active.lane.mask.next = tail call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 %index, i64 %4)
+  %17 = tail call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> %active.lane.mask.next, i64 0)
+  %18 = tail call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> %active.lane.mask.next, i64 16)
+  %19 = extractelement <vscale x 16 x i1> %17, i64 0
+  br i1 %19, label %vector.body, label %for.end, !llvm.loop !0
+
+for.end:
+  ret void
+}
+
+declare i64 @llvm.vscale.i64()
+declare <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64, i64)
+declare <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1>, i64 immarg)
+declare <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr captures(none), i32 immarg, <vscale x 16 x i1>, <vscale x 16 x i8>)
+declare void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8>, ptr captures(none), i32 immarg, <vscale x 16 x i1>)
+declare i64 @llvm.usub.sat.i64(i64, i64)
+
+attributes #0 = { vscale_range(1,16) }
+
+!0 = distinct !{!0, !1, !2}
+!1 = !{!"llvm.loop.isvectorized", i32 1}
+!2 = !{!"llvm.loop.unroll.runtime.disable"}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fixed-wide-lane-mask.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fixed-wide-lane-mask.ll
new file mode 100644
index 0000000000000..52128de119b9b
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/fixed-wide-lane-mask.ll
@@ -0,0 +1,129 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S --passes="default<O3>" -scalable-vectorization=off -prefer-predicate-over-epilogue=predicate-dont-vectorize -enable-wide-lane-mask \
+; RUN:   -force-vector-width=4 -force-vector-interleave=0 < %s | FileCheck %s -check-prefix CHECK-UF0
+; RUN: opt -S --passes="default<O3>" -scalable-vectorization=off -prefer-predicate-over-epilogue=predicate-dont-vectorize -enable-wide-lane-mask \
+; RUN:   -force-vector-width=4 -force-vector-interleave=2 < %s | FileCheck %s -check-prefix CHECK-UF2
+; RUN: opt -S --passes="default<O3>" -scalable-vectorization=off -prefer-predicate-over-epilogue=predicate-dont-vectorize -enable-wide-lane-mask \
+; RUN:   -force-vector-width=4 -force-vector-interleave=4 < %s | FileCheck %s -check-prefix CHECK-UF4
+
+target triple = "aarch64-unknown-linux"
+
+define void @fixed_wide_active_lane_mask(ptr noalias %dst, ptr noalias readonly %src, i64 %n) #0 {
+; CHECK-UF0-LABEL: define void @fixed_wide_active_lane_mask(
+; CHECK-UF0-SAME: ptr noalias writeonly captures(none) [[DST:%.*]], ptr noalias readonly captures(none) [[SRC:%.*]], i64 [[N:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-UF0-NEXT:  entry:
+; CHECK-UF0-NEXT:    [[LD:%.*]] = load i32, ptr [[SRC]], align 4
+; CHECK-UF0-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.usub.sat.i64(i64 [[N]], i64 4)
+; CHECK-UF0-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 0, i64 [[N]])
+; CHECK-UF0-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[LD]], i64 0
+; CHECK-UF0-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-UF0-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-UF0:       vector.body:
+; CHECK-UF0-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-UF0-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-UF0-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw i32, ptr [[DST]], i64 [[INDEX]]
+; CHECK-UF0-NEXT:    tail call void @llvm.masked.store.v4i32.p0(<4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP2]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-UF0-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-UF0-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = tail call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 [[TMP0]])
+; CHECK-UF0-NEXT:    [[TMP3:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-UF0-NEXT:    br i1 [[TMP3]], label [[VECTOR_BODY]], label [[FOR_END:%.*]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-UF0:       for.end:
+; CHECK-UF0-NEXT:    ret void
+;
+; CHECK-UF2-LABEL: define void @fixed_wide_active_lane_mask(
+; CHECK-UF2-SAME: ptr noalias writeonly captures(none) [[DST:%.*]], ptr noalias readonly captures(none) [[SRC:%.*]], i64 [[N:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-UF2-NEXT:  entry:
+; CHECK-UF2-NEXT:    [[LD:%.*]] = load i32, ptr [[SRC]], align 4
+; CHECK-UF2-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.usub.sat.i64(i64 [[N]], i64 8)
+; CHECK-UF2-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 0, i64 [[N]])
+; CHECK-UF2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i1> [[ACTIVE_LANE_MASK_ENTRY]], <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-UF2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i1> [[ACTIVE_LANE_MASK_ENTRY]], <8 x i1> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-UF2-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[LD]], i64 0
+; CHECK-UF2-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-UF2-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-UF2:       vector.body:
+; CHECK-UF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-UF2-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[TMP1]], [[ENTRY]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-UF2-NEXT:    [[ACTIVE_LANE_MASK2:%.*]] = phi <4 x i1> [ [[TMP2]], [[ENTRY]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
+; CHECK-UF2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i32, ptr [[DST]], i64 [[INDEX]]
+; CHECK-UF2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP3]], i64 16
+; CHECK-UF2-NEXT:    tail call void @llvm.masked.store.v4i32.p0(<4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-UF2-NEXT:    tail call void @llvm.masked.store.v4i32.p0(<4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP4]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK2]])
+; CHECK-UF2-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
+; CHECK-UF2-NEXT:    [[ACTIVE_LANE_MASK_NEXT:%.*]] = tail call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 [[INDEX]], i64 [[TMP0]])
+; CHECK-UF2-NEXT:    [[TMP5]] = shufflevector <8 x i1> [[ACTIVE_LANE_MASK_NEXT]], <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-UF2-NEXT:    [[TMP6]] = shufflevector <8 x i1> [[ACTIVE_LANE_MASK_NEXT]], <8 x i1> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-UF2-NEXT:    [[TMP7:%.*]] = extractelement <8 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-UF2-NEXT:    br i1 [[TMP7]], label [[VECTOR_BODY]], label [[FOR_END:%.*]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-UF2:       for.end:
+; CHECK-UF2-NEXT:    ret void
+;
+; CHECK-UF4-LABEL: define void @fixed_wide_active_lane_mask(
+; CHECK-UF4-SAME: ptr noalias writeonly captures(none) [[DST:%.*]], ptr noalias readonly captures(none) [[SRC:%.*]], i64 [[N:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-UF4-NEXT:  entry:
+; CHECK-UF4-NEXT:    [[LD:%.*]] = load i32, ptr [[SRC]], align 4
+; CHECK-UF4-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.usub.sat.i64(i64 [[N]], i64 16)
+; CHECK-UF4-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 0, i64 [[N]])
+; CHECK-UF4-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i1> [[ACTIVE_LANE_MASK_ENTRY]], <16 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-UF4-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i1> [[ACTIVE_LANE_MASK_ENTRY]], <16 x i1> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-UF4-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i1> [[ACTIVE_LANE_MASK_ENTRY]], <16 x i1> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; CHECK-UF4-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i1> [[ACTIVE_LANE_MASK_ENTRY]], <16 x i1> poison, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; CHECK-UF4-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[LD]], i64 0
+; CHECK-UF4-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-UF4-NEXT:    br label [[VECTOR_BODY1:%.*]]
+; CHECK-UF4:       vector.body:
+; CHECK-UF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY1]] ]
+; CHECK-UF4-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[TMP1]], [[ENTRY]] ], [ [[TMP9:%.*]], [[VECTOR_BODY1]] ]
+; CHECK-UF4-NEXT:    [[ACTIVE_LANE_MASK4:%.*]] = phi <4 x i1> [ [[TMP2]], [[ENTRY]] ], [ [[TMP10:%.*]], [[VECTOR_BODY1]] ]
+; CHECK-UF4-NEXT:    [[ACTIVE_LANE_MASK5:%.*]] = phi <4 x i1> [ [[TMP3]], [[ENTRY]] ], [ [[TMP11:%.*]], [[VECTOR_BODY1]] ]
+; CHECK-UF4-NEXT:    [[ACTIVE_LANE_MASK6:%.*]] = phi <4 x i1> [ [[TMP4]], [[ENTRY]] ], [ [[TMP12:%.*]], [[VECTOR_BODY1]] ]
+; CHECK-UF4-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i32, ptr [[DST]], i64 [[INDEX]]
+; CHECK-UF4-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP5]], i64 16
+; CHECK-UF4-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP5]], i64 32
+; CHECK-UF4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP5]], i64 48
+; CHECK-UF4-NEXT:    tail call void @llvm.masked.store.v4i32.p0(<4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP5]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-UF4-NEXT:    tail call void @llvm.masked.store.v4i32.p0(<4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP6]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK4]])
+; CHECK-UF4-NEXT:    tail call void @llvm.masked.store.v4i32.p0(<4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP7]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK5]])
+; CHECK-UF4-NEXT:    tail call void @llvm.masked.store.v4i32.p0(<4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP8]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK6]])
+; CHECK-UF4-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
+; CHECK-UF4-NEXT:    [[ACTIVE_LANE_MASK_NEXT:%.*]] = tail call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 [[INDEX]], i64 [[TMP0]])
+; CHECK-UF4-NEXT:    [[TMP9]] = shufflevector <16 x i1> [[ACTIVE_LANE_MASK_NEXT]], <16 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-UF4-NEXT:    [[TMP10]] = shufflevector <16 x i1> [[ACTIVE_LANE_MASK_NEXT]], <16 x i1> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-UF4-NEXT:    [[TMP11]] = shufflevector <16 x i1> [[ACTIVE_LANE_MASK_NEXT]], <16 x i1> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; CHECK-UF4-NEXT:    [[TMP12]] = shufflevector <16 x i1> [[ACTIVE_LANE_MASK_NEXT]], <16 x i1> poison, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; CHECK-UF4-NEXT:    [[TMP13:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-UF4-NEXT:    br i1 [[TMP13]], label [[VECTOR_BODY1]], label [[FOR_END:%.*]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-UF4:       for.end:
+; CHECK-UF4-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %ld = load i32, ptr %src
+  %arrayidx = getelementptr inbounds i32, ptr %dst, i64 %iv
+  store i32 %ld, ptr %arrayidx
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+attributes #0 = { nounwind "target-features"="+neon,+sve" }
+
+;.
+; CHECK-UF0: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK-UF0: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK-UF0: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+;.
+; CHECK-UF2: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK-UF2: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK-UF2: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+;.
+; CHECK-UF4: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK-UF4: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK-UF4: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-wide-lane-mask.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-wide-lane-mask.ll
new file mode 100644
index 0000000000000..790b78f9002d2
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-wide-lane-mask.ll
@@ -0,0 +1,335 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S --passes="default<O3>" -prefer-predicate-over-epilogue=predicate-dont-vectorize -enable-wide-lane-mask -force-vector-interleave=0 < %s | FileCheck %s -check-prefix CHECK-SVE-UF0
+; RUN: opt -S --passes="default<O3>" -prefer-predicate-over-epilogue=predicate-dont-vectorize -enable-wide-lane-mask -force-vector-interleave=2 < %s | FileCheck %s -check-prefix CHECK-SVE-UF2
+; RUN: opt -S --passes="default<O3>" -prefer-predicate-over-epilogue=predicate-dont-vectorize -enable-wide-lane-mask -force-vector-interleave=4 < %s | FileCheck %s -check-prefix CHECK-SVE-UF4
+
+target triple = "aarch64-unknown-linux"
+
+define void @scalable_wide_active_lane_mask(ptr noalias %dst, ptr readonly %src, i64 %n) #0 {
+; CHECK-SVE-UF0-LABEL: define void @scalable_wide_active_lane_mask(
+; CHECK-SVE-UF0-SAME: ptr noalias writeonly captures(none) [[DST:%.*]], ptr readonly captures(none) [[SRC:%.*]], i64 [[N:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-SVE-UF0-NEXT:  entry:
+; CHECK-SVE-UF0-NEXT:    [[CMP7:%.*]] = icmp sgt i64 [[N]], 0
+; CHECK-SVE-UF0-NEXT:    br i1 [[CMP7]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK-SVE-UF0:       vector.ph:
+; CHECK-SVE-UF0-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF0-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+; CHECK-SVE-UF0-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF0-NEXT:    [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 4
+; CHECK-SVE-UF0-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP3]])
+; CHECK-SVE-UF0-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[N]])
+; CHECK-SVE-UF0-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-SVE-UF0:       vector.body:
+; CHECK-SVE-UF0-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-UF0-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-UF0-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-SVE-UF0-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP5]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
+; CHECK-SVE-UF0-NEXT:    [[TMP6:%.*]] = mul <vscale x 16 x i8> [[WIDE_MASKED_LOAD]], splat (i8 3)
+; CHECK-SVE-UF0-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[DST]], i64 [[INDEX]]
+; CHECK-SVE-UF0-NEXT:    tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP6]], ptr [[TMP7]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-SVE-UF0-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
+; CHECK-SVE-UF0-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP4]])
+; CHECK-SVE-UF0-NEXT:    [[TMP8:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-SVE-UF0-NEXT:    br i1 [[TMP8]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-SVE-UF0:       for.end:
+; CHECK-SVE-UF0-NEXT:    ret void
+;
+; CHECK-SVE-UF2-LABEL: define void @scalable_wide_active_lane_mask(
+; CHECK-SVE-UF2-SAME: ptr noalias writeonly captures(none) [[DST:%.*]], ptr readonly captures(none) [[SRC:%.*]], i64 [[N:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-SVE-UF2-NEXT:  entry:
+; CHECK-SVE-UF2-NEXT:    [[CMP7:%.*]] = icmp sgt i64 [[N]], 0
+; CHECK-SVE-UF2-NEXT:    br i1 [[CMP7]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK-SVE-UF2:       vector.ph:
+; CHECK-SVE-UF2-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF2-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 5
+; CHECK-SVE-UF2-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF2-NEXT:    [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 5
+; CHECK-SVE-UF2-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP3]])
+; CHECK-SVE-UF2-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 0, i64 [[N]])
+; CHECK-SVE-UF2-NEXT:    [[TMP5:%.*]] = tail call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 0)
+; CHECK-SVE-UF2-NEXT:    [[TMP6:%.*]] = tail call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 16)
+; CHECK-SVE-UF2-NEXT:    [[TMP7:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF2-NEXT:    [[TMP8:%.*]] = shl nuw nsw i64 [[TMP7]], 4
+; CHECK-SVE-UF2-NEXT:    [[TMP9:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF2-NEXT:    [[TMP10:%.*]] = shl nuw nsw i64 [[TMP9]], 4
+; CHECK-SVE-UF2-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-SVE-UF2:       vector.body:
+; CHECK-SVE-UF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-UF2-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[TMP5]], [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-UF2-NEXT:    [[ACTIVE_LANE_MASK2:%.*]] = phi <vscale x 16 x i1> [ [[TMP6]], [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-UF2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-SVE-UF2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP11]], i64 [[TMP8]]
+; CHECK-SVE-UF2-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP11]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
+; CHECK-SVE-UF2-NEXT:    [[WIDE_MASKED_LOAD3:%.*]] = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr nonnull [[TMP12]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 16 x i8> poison)
+; CHECK-SVE-UF2-NEXT:    [[TMP13:%.*]] = mul <vscale x 16 x i8> [[WIDE_MASKED_LOAD]], splat (i8 3)
+; CHECK-SVE-UF2-NEXT:    [[TMP14:%.*]] = mul <vscale x 16 x i8> [[WIDE_MASKED_LOAD3]], splat (i8 3)
+; CHECK-SVE-UF2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds nuw i8, ptr [[DST]], i64 [[INDEX]]
+; CHECK-SVE-UF2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP15]], i64 [[TMP10]]
+; CHECK-SVE-UF2-NEXT:    tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP13]], ptr [[TMP15]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-SVE-UF2-NEXT:    tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP14]], ptr [[TMP16]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK2]])
+; CHECK-SVE-UF2-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
+; CHECK-SVE-UF2-NEXT:    [[ACTIVE_LANE_MASK_NEXT:%.*]] = tail call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 [[INDEX]], i64 [[TMP4]])
+; CHECK-SVE-UF2-NEXT:    [[TMP17]] = tail call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0)
+; CHECK-SVE-UF2-NEXT:    [[TMP18]] = tail call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 16)
+; CHECK-SVE-UF2-NEXT:    [[TMP19:%.*]] = extractelement <vscale x 16 x i1> [[TMP17]], i64 0
+; CHECK-SVE-UF2-NEXT:    br i1 [[TMP19]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-SVE-UF2:       for.end:
+; CHECK-SVE-UF2-NEXT:    ret void
+;
+; CHECK-SVE-UF4-LABEL: define void @scalable_wide_active_lane_mask(
+; CHECK-SVE-UF4-SAME: ptr noalias writeonly captures(none) [[DST:%.*]], ptr readonly captures(none) [[SRC:%.*]], i64 [[N:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-SVE-UF4-NEXT:  entry:
+; CHECK-SVE-UF4-NEXT:    [[CMP7:%.*]] = icmp sgt i64 [[N]], 0
+; CHECK-SVE-UF4-NEXT:    br i1 [[CMP7]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK-SVE-UF4:       vector.ph:
+; CHECK-SVE-UF4-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF4-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 6
+; CHECK-SVE-UF4-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF4-NEXT:    [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 6
+; CHECK-SVE-UF4-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP3]])
+; CHECK-SVE-UF4-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 64 x i1> @llvm.get.active.lane.mask.nxv64i1.i64(i64 0, i64 [[N]])
+; CHECK-SVE-UF4-NEXT:    [[TMP5:%.*]] = tail call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv64i1(<vscale x 64 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 0)
+; CHECK-SVE-UF4-NEXT:    [[TMP6:%.*]] = tail call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv64i1(<vscale x 64 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 16)
+; CHECK-SVE-UF4-NEXT:    [[TMP7:%.*]] = tail call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv64i1(<vscale x 64 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 32)
+; CHECK-SVE-UF4-NEXT:    [[TMP8:%.*]] = tail call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv64i1(<vscale x 64 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 48)
+; CHECK-SVE-UF4-NEXT:    [[TMP9:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF4-NEXT:    [[TMP10:%.*]] = shl nuw nsw i64 [[TMP9]], 4
+; CHECK-SVE-UF4-NEXT:    [[TMP11:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF4-NEXT:    [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 5
+; CHECK-SVE-UF4-NEXT:    [[TMP13:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF4-NEXT:    [[TMP14:%.*]] = mul nuw nsw i64 [[TMP13]], 48
+; CHECK-SVE-UF4-NEXT:    [[TMP15:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF4-NEXT:    [[TMP16:%.*]] = shl nuw nsw i64 [[TMP15]], 4
+; CHECK-SVE-UF4-NEXT:    [[TMP17:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF4-NEXT:    [[TMP18:%.*]] = shl nuw nsw i64 [[TMP17]], 5
+; CHECK-SVE-UF4-NEXT:    [[TMP19:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF4-NEXT:    [[TMP20:%.*]] = mul nuw nsw i64 [[TMP19]], 48
+; CHECK-SVE-UF4-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-SVE-UF4:       vector.body:
+; CHECK-SVE-UF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-UF4-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[TMP5]], [[VECTOR_PH]] ], [ [[TMP33:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-UF4-NEXT:    [[ACTIVE_LANE_MASK6:%.*]] = phi <vscale x 16 x i1> [ [[TMP6]], [[VECTOR_PH]] ], [ [[TMP34:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-UF4-NEXT:    [[ACTIVE_LANE_MASK7:%.*]] = phi <vscale x 16 x i1> [ [[TMP7]], [[VECTOR_PH]] ], [ [[TMP35:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-UF4-NEXT:    [[ACTIVE_LANE_MASK8:%.*]] = phi <vscale x 16 x i1> [ [[TMP8]], [[VECTOR_PH]] ], [ [[TMP36:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-UF4-NEXT:    [[TMP21:%.*]] = getelementptr inbounds nuw i8, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-SVE-UF4-NEXT:    [[TMP22:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP21]], i64 [[TMP10]]
+; CHECK-SVE-UF4-NEXT:    [[TMP23:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP21]], i64 [[TMP12]]
+; CHECK-SVE-UF4-NEXT:    [[TMP24:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP21]], i64 [[TMP14]]
+; CHECK-SVE-UF4-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP21]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
+; CHECK-SVE-UF4-NEXT:    [[WIDE_MASKED_LOAD9:%.*]] = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr nonnull [[TMP22]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK6]], <vscale x 16 x i8> poison)
+; CHECK-SVE-UF4-NEXT:    [[WIDE_MASKED_LOAD10:%.*]] = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr nonnull [[TMP23]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 16 x i8> poison)
+; CHECK-SVE-UF4-NEXT:    [[WIDE_MASKED_LOAD11:%.*]] = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr nonnull [[TMP24]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 16 x i8> poison)
+; CHECK-SVE-UF4-NEXT:    [[TMP25:%.*]] = mul <vscale x 16 x i8> [[WIDE_MASKED_LOAD]], splat (i8 3)
+; CHECK-SVE-UF4-NEXT:    [[TMP26:%.*]] = mul <vscale x 16 x i8> [[WIDE_MASKED_LOAD9]], splat (i8 3)
+; CHECK-SVE-UF4-NEXT:    [[TMP27:%.*]] = mul <vscale x 16 x i8> [[WIDE_MASKED_LOAD10]], splat (i8 3)
+; CHECK-SVE-UF4-NEXT:    [[TMP28:%.*]] = mul <vscale x 16 x i8> [[WIDE_MASKED_LOAD11]], splat (i8 3)
+; CHECK-SVE-UF4-NEXT:    [[TMP29:%.*]] = getelementptr inbounds nuw i8, ptr [[DST]], i64 [[INDEX]]
+; CHECK-SVE-UF4-NEXT:    [[TMP30:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP29]], i64 [[TMP16]]
+; CHECK-SVE-UF4-NEXT:    [[TMP31:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP29]], i64 [[TMP18]]
+; CHECK-SVE-UF4-NEXT:    [[TMP32:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP29]], i64 [[TMP20]]
+; CHECK-SVE-UF4-NEXT:    tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP25]], ptr [[TMP29]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-SVE-UF4-NEXT:    tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP26]], ptr [[TMP30]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK6]])
+; CHECK-SVE-UF4-NEXT:    tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP27]], ptr [[TMP31]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK7]])
+; CHECK-SVE-UF4-NEXT:    tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP28]], ptr [[TMP32]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK8]])
+; CHECK-SVE-UF4-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
+; CHECK-SVE-UF4-NEXT:    [[ACTIVE_LANE_MASK_NEXT:%.*]] = tail call <vscale x 64 x i1> @llvm.get.active.lane.mask.nxv64i1.i64(i64 [[INDEX]], i64 [[TMP4]])
+; CHECK-SVE-UF4-NEXT:    [[TMP33]] = tail call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv64i1(<vscale x 64 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0)
+; CHECK-SVE-UF4-NEXT:    [[TMP34]] = tail call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv64i1(<vscale x 64 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 16)
+; CHECK-SVE-UF4-NEXT:    [[TMP35]] = tail call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv64i1(<vscale x 64 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 32)
+; CHECK-SVE-UF4-NEXT:    [[TMP36]] = tail call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv64i1(<vscale x 64 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 48)
+; CHECK-SVE-UF4-NEXT:    [[TMP37:%.*]] = extractelement <vscale x 16 x i1> [[TMP33]], i64 0
+; CHECK-SVE-UF4-NEXT:    br i1 [[TMP37]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-SVE-UF4:       for.end:
+; CHECK-SVE-UF4-NEXT:    ret void
+;
+entry:
+  %cmp = icmp sgt i64 %n, 0
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx1 = getelementptr inbounds i8, ptr %src, i64 %iv
+  %ld = load i8, ptr %arrayidx1
+  %mul = mul i8 %ld, 3
+  %arrayidx2 = getelementptr inbounds i8, ptr %dst, i64 %iv
+  store i8 %mul, ptr %arrayidx2
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @scalable_wide_active_lane_mask_float(ptr noalias %dst, ptr readonly %src, i32 %n) #0 {
+; CHECK-SVE-UF0-LABEL: define void @scalable_wide_active_lane_mask_float(
+; CHECK-SVE-UF0-SAME: ptr noalias writeonly captures(none) [[DST:%.*]], ptr readonly captures(none) [[SRC:%.*]], i32 [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-SVE-UF0-NEXT:  entry:
+; CHECK-SVE-UF0-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-SVE-UF0-NEXT:    br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK-SVE-UF0:       for.body.preheader:
+; CHECK-SVE-UF0-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
+; CHECK-SVE-UF0-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF0-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 1
+; CHECK-SVE-UF0-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+; CHECK-SVE-UF0-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-SVE-UF0:       vector.body:
+; CHECK-SVE-UF0-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-UF0-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-UF0-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw double, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-SVE-UF0-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP2]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> poison)
+; CHECK-SVE-UF0-NEXT:    [[TMP3:%.*]] = fmul <vscale x 2 x double> [[WIDE_MASKED_LOAD]], splat (double 3.000000e+00)
+; CHECK-SVE-UF0-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw double, ptr [[DST]], i64 [[INDEX]]
+; CHECK-SVE-UF0-NEXT:    tail call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP3]], ptr [[TMP4]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-SVE-UF0-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
+; CHECK-SVE-UF0-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-SVE-UF0-NEXT:    [[TMP5:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-SVE-UF0-NEXT:    br i1 [[TMP5]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-SVE-UF0:       for.end:
+; CHECK-SVE-UF0-NEXT:    ret void
+;
+; CHECK-SVE-UF2-LABEL: define void @scalable_wide_active_lane_mask_float(
+; CHECK-SVE-UF2-SAME: ptr noalias writeonly captures(none) [[DST:%.*]], ptr readonly captures(none) [[SRC:%.*]], i32 [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-SVE-UF2-NEXT:  entry:
+; CHECK-SVE-UF2-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-SVE-UF2-NEXT:    br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK-SVE-UF2:       for.body.preheader:
+; CHECK-SVE-UF2-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
+; CHECK-SVE-UF2-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF2-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
+; CHECK-SVE-UF2-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+; CHECK-SVE-UF2-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 0)
+; CHECK-SVE-UF2-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 2)
+; CHECK-SVE-UF2-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF2-NEXT:    [[DOTIDX:%.*]] = shl nuw nsw i64 [[TMP4]], 4
+; CHECK-SVE-UF2-NEXT:    [[TMP5:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF2-NEXT:    [[DOTIDX5:%.*]] = shl nuw nsw i64 [[TMP5]], 4
+; CHECK-SVE-UF2-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-SVE-UF2:       vector.body:
+; CHECK-SVE-UF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-UF2-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[TMP2]], [[FOR_BODY_PREHEADER]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-UF2-NEXT:    [[ACTIVE_LANE_MASK2:%.*]] = phi <vscale x 2 x i1> [ [[TMP3]], [[FOR_BODY_PREHEADER]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-UF2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw double, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-SVE-UF2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], i64 [[DOTIDX]]
+; CHECK-SVE-UF2-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP6]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> poison)
+; CHECK-SVE-UF2-NEXT:    [[WIDE_MASKED_LOAD3:%.*]] = tail call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr nonnull [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 2 x double> poison)
+; CHECK-SVE-UF2-NEXT:    [[TMP8:%.*]] = fmul <vscale x 2 x double> [[WIDE_MASKED_LOAD]], splat (double 3.000000e+00)
+; CHECK-SVE-UF2-NEXT:    [[TMP9:%.*]] = fmul <vscale x 2 x double> [[WIDE_MASKED_LOAD3]], splat (double 3.000000e+00)
+; CHECK-SVE-UF2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw double, ptr [[DST]], i64 [[INDEX]]
+; CHECK-SVE-UF2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP10]], i64 [[DOTIDX5]]
+; CHECK-SVE-UF2-NEXT:    tail call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP8]], ptr [[TMP10]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-SVE-UF2-NEXT:    tail call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP9]], ptr [[TMP11]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]])
+; CHECK-SVE-UF2-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
+; CHECK-SVE-UF2-NEXT:    [[ACTIVE_LANE_MASK_NEXT:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-SVE-UF2-NEXT:    [[TMP12]] = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0)
+; CHECK-SVE-UF2-NEXT:    [[TMP13]] = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 2)
+; CHECK-SVE-UF2-NEXT:    [[TMP14:%.*]] = extractelement <vscale x 2 x i1> [[TMP12]], i64 0
+; CHECK-SVE-UF2-NEXT:    br i1 [[TMP14]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-SVE-UF2:       for.end:
+; CHECK-SVE-UF2-NEXT:    ret void
+;
+; CHECK-SVE-UF4-LABEL: define void @scalable_wide_active_lane_mask_float(
+; CHECK-SVE-UF4-SAME: ptr noalias writeonly captures(none) [[DST:%.*]], ptr readonly captures(none) [[SRC:%.*]], i32 [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-SVE-UF4-NEXT:  entry:
+; CHECK-SVE-UF4-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-SVE-UF4-NEXT:    br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK-SVE-UF4:       for.body.preheader:
+; CHECK-SVE-UF4-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
+; CHECK-SVE-UF4-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF4-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 3
+; CHECK-SVE-UF4-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+; CHECK-SVE-UF4-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv8i1(<vscale x 8 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 0)
+; CHECK-SVE-UF4-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv8i1(<vscale x 8 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 2)
+; CHECK-SVE-UF4-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv8i1(<vscale x 8 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 4)
+; CHECK-SVE-UF4-NEXT:    [[TMP5:%.*]] = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv8i1(<vscale x 8 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 6)
+; CHECK-SVE-UF4-NEXT:    [[TMP6:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF4-NEXT:    [[DOTIDX:%.*]] = shl nuw nsw i64 [[TMP6]], 4
+; CHECK-SVE-UF4-NEXT:    [[TMP7:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF4-NEXT:    [[DOTIDX15:%.*]] = shl nuw nsw i64 [[TMP7]], 5
+; CHECK-SVE-UF4-NEXT:    [[TMP8:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF4-NEXT:    [[DOTIDX16:%.*]] = mul nuw nsw i64 [[TMP8]], 48
+; CHECK-SVE-UF4-NEXT:    [[TMP9:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF4-NEXT:    [[DOTIDX17:%.*]] = shl nuw nsw i64 [[TMP9]], 4
+; CHECK-SVE-UF4-NEXT:    [[TMP10:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF4-NEXT:    [[DOTIDX18:%.*]] = shl nuw nsw i64 [[TMP10]], 5
+; CHECK-SVE-UF4-NEXT:    [[TMP11:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF4-NEXT:    [[DOTIDX19:%.*]] = mul nuw nsw i64 [[TMP11]], 48
+; CHECK-SVE-UF4-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-SVE-UF4:       vector.body:
+; CHECK-SVE-UF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-UF4-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[TMP2]], [[FOR_BODY_PREHEADER]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-UF4-NEXT:    [[ACTIVE_LANE_MASK6:%.*]] = phi <vscale x 2 x i1> [ [[TMP3]], [[FOR_BODY_PREHEADER]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-UF4-NEXT:    [[ACTIVE_LANE_MASK7:%.*]] = phi <vscale x 2 x i1> [ [[TMP4]], [[FOR_BODY_PREHEADER]] ], [ [[TMP26:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-UF4-NEXT:    [[ACTIVE_LANE_MASK8:%.*]] = phi <vscale x 2 x i1> [ [[TMP5]], [[FOR_BODY_PREHEADER]] ], [ [[TMP27:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-UF4-NEXT:    [[TMP12:%.*]] = getelementptr inbounds nuw double, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-SVE-UF4-NEXT:    [[TMP13:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP12]], i64 [[DOTIDX]]
+; CHECK-SVE-UF4-NEXT:    [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP12]], i64 [[DOTIDX15]]
+; CHECK-SVE-UF4-NEXT:    [[TMP15:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP12]], i64 [[DOTIDX16]]
+; CHECK-SVE-UF4-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP12]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> poison)
+; CHECK-SVE-UF4-NEXT:    [[WIDE_MASKED_LOAD9:%.*]] = tail call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr nonnull [[TMP13]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK6]], <vscale x 2 x double> poison)
+; CHECK-SVE-UF4-NEXT:    [[WIDE_MASKED_LOAD10:%.*]] = tail call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr nonnull [[TMP14]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 2 x double> poison)
+; CHECK-SVE-UF4-NEXT:    [[WIDE_MASKED_LOAD11:%.*]] = tail call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr nonnull [[TMP15]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 2 x double> poison)
+; CHECK-SVE-UF4-NEXT:    [[TMP16:%.*]] = fmul <vscale x 2 x double> [[WIDE_MASKED_LOAD]], splat (double 3.000000e+00)
+; CHECK-SVE-UF4-NEXT:    [[TMP17:%.*]] = fmul <vscale x 2 x double> [[WIDE_MASKED_LOAD9]], splat (double 3.000000e+00)
(double 3.000000e+00) +; CHECK-SVE-UF4-NEXT: [[TMP18:%.*]] = fmul [[WIDE_MASKED_LOAD10]], splat (double 3.000000e+00) +; CHECK-SVE-UF4-NEXT: [[TMP19:%.*]] = fmul [[WIDE_MASKED_LOAD11]], splat (double 3.000000e+00) +; CHECK-SVE-UF4-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw double, ptr [[DST]], i64 [[INDEX]] +; CHECK-SVE-UF4-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP20]], i64 [[DOTIDX17]] +; CHECK-SVE-UF4-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP20]], i64 [[DOTIDX18]] +; CHECK-SVE-UF4-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP20]], i64 [[DOTIDX19]] +; CHECK-SVE-UF4-NEXT: tail call void @llvm.masked.store.nxv2f64.p0( [[TMP16]], ptr [[TMP20]], i32 8, [[ACTIVE_LANE_MASK]]) +; CHECK-SVE-UF4-NEXT: tail call void @llvm.masked.store.nxv2f64.p0( [[TMP17]], ptr [[TMP21]], i32 8, [[ACTIVE_LANE_MASK6]]) +; CHECK-SVE-UF4-NEXT: tail call void @llvm.masked.store.nxv2f64.p0( [[TMP18]], ptr [[TMP22]], i32 8, [[ACTIVE_LANE_MASK7]]) +; CHECK-SVE-UF4-NEXT: tail call void @llvm.masked.store.nxv2f64.p0( [[TMP19]], ptr [[TMP23]], i32 8, [[ACTIVE_LANE_MASK8]]) +; CHECK-SVE-UF4-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK_NEXT:%.*]] = tail call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE-UF4-NEXT: [[TMP24]] = tail call @llvm.vector.extract.nxv2i1.nxv8i1( [[ACTIVE_LANE_MASK_NEXT]], i64 0) +; CHECK-SVE-UF4-NEXT: [[TMP25]] = tail call @llvm.vector.extract.nxv2i1.nxv8i1( [[ACTIVE_LANE_MASK_NEXT]], i64 2) +; CHECK-SVE-UF4-NEXT: [[TMP26]] = tail call @llvm.vector.extract.nxv2i1.nxv8i1( [[ACTIVE_LANE_MASK_NEXT]], i64 4) +; CHECK-SVE-UF4-NEXT: [[TMP27]] = tail call @llvm.vector.extract.nxv2i1.nxv8i1( [[ACTIVE_LANE_MASK_NEXT]], i64 6) +; CHECK-SVE-UF4-NEXT: [[TMP28:%.*]] = extractelement [[TMP24]], i64 0 +; CHECK-SVE-UF4-NEXT: br i1 [[TMP28]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-SVE-UF4: for.end: +; CHECK-SVE-UF4-NEXT: ret void +; +entry: + %cmp6 = icmp sgt i32 %n, 0 + br i1 %cmp6, label %for.body.preheader, label %for.end + +for.body.preheader: + %wide.trip.count = zext nneg i32 %n to i64 + br label %for.body + +for.body: + %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ] + %arrayidx1 = getelementptr inbounds double, ptr %src, i64 %iv + %ld = load double, ptr %arrayidx1 + %mul = fmul double %ld, 3.000000e+00 + %arrayidx2 = getelementptr inbounds double, ptr %dst, i64 %iv + store double %mul, ptr %arrayidx2 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: + ret void +} + +attributes #0 = { nounwind vscale_range(1,16) "target-features"="+sve2p1" } + +;. +; CHECK-SVE-UF0: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-SVE-UF0: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-SVE-UF0: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-SVE-UF0: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]} +;. +; CHECK-SVE-UF2: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-SVE-UF2: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-SVE-UF2: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-SVE-UF2: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]} +;. 
+; CHECK-SVE-UF4: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK-SVE-UF4: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK-SVE-UF4: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK-SVE-UF4: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/active-lane-mask.ll b/llvm/test/Transforms/LoopVectorize/ARM/active-lane-mask.ll
new file mode 100644
index 0000000000000..d20216654c3b4
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/ARM/active-lane-mask.ll
@@ -0,0 +1,88 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=loop-vectorize -tail-predication=enabled -prefer-predicate-over-epilogue=predicate-dont-vectorize -enable-wide-lane-mask -S | FileCheck %s
+
+target triple = "thumbv8.1m.main-arm-unknown-eabihf"
+
+define void @f0(ptr noalias %dst, ptr readonly %src, i64 %n) #0 {
+; CHECK-LABEL: define void @f0(
+; CHECK-SAME: ptr noalias [[DST:%.*]], ptr readonly [[SRC:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[VAL:%.*]] = icmp sgt i64 [[N]], 0
+; CHECK-NEXT:    br i1 [[VAL]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_END:.*]]
+; CHECK:       [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], 31
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 32
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 16
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 [[INDEX]], i64 [[N]])
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK1:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 [[TMP0]], i64 [[N]])
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 16
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP2]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD2:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP3]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK1]], <16 x i8> poison)
+; CHECK-NEXT:    [[TMP4:%.*]] = mul <16 x i8> [[WIDE_MASKED_LOAD]], splat (i8 3)
+; CHECK-NEXT:    [[TMP5:%.*]] = mul <16 x i8> [[WIDE_MASKED_LOAD2]], splat (i8 3)
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 16
+; CHECK-NEXT:    call void @llvm.masked.store.v16i8.p0(<16 x i8> [[TMP4]], ptr [[TMP7]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    call void @llvm.masked.store.v16i8.p0(<16 x i8> [[TMP5]], ptr [[TMP8]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK1]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 32
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    br label %[[FOR_END_LOOPEXIT:.*]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[MUL:%.*]] = mul i8 [[TMP10]], 3
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i8 [[MUL]], ptr [[ARRAYIDX3]], align 1
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_END_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       [[FOR_END_LOOPEXIT]]:
+; CHECK-NEXT:    br label %[[FOR_END]]
+; CHECK:       [[FOR_END]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %val = icmp sgt i64 %n, 0
+  br i1 %val, label %for.body, label %for.end
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i8, ptr %src, i64 %indvars.iv
+  %0 = load i8, ptr %arrayidx, align 1
+  %mul = mul i8 %0, 3
+  %arrayidx3 = getelementptr inbounds i8, ptr %dst, i64 %indvars.iv
+  store i8 %mul, ptr %arrayidx3, align 1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !1
+
+for.end:
+  ret void
+}
+
+attributes #0 = { nofree norecurse nounwind "target-features"="+armv8.1-m.main,+mve.fp" }
+
+!1 = distinct !{!1, !2, !3}
+!2 = !{!"llvm.loop.vectorize.width", i32 16}
+!3 = !{!"llvm.loop.interleave.count", i32 2}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.