diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 63fccee63c0ae..1dff9c3513a28 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -163,6 +163,7 @@ bool llvm::isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID,
   case Intrinsic::is_fpclass:
   case Intrinsic::vp_is_fpclass:
   case Intrinsic::powi:
+  case Intrinsic::vector_extract:
     return (ScalarOpdIdx == 1);
   case Intrinsic::smul_fix:
   case Intrinsic::smul_fix_sat:
@@ -195,6 +196,7 @@ bool llvm::isVectorIntrinsicWithOverloadTypeAtArg(
   case Intrinsic::vp_llrint:
   case Intrinsic::ucmp:
   case Intrinsic::scmp:
+  case Intrinsic::vector_extract:
     return OpdIdx == -1 || OpdIdx == 0;
   case Intrinsic::modf:
   case Intrinsic::sincos:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 144f35e10132f..dd54d964f8883 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -44,6 +44,7 @@ class VPRecipeBuilder;
 struct VFRange;
 
 extern cl::opt<bool> EnableVPlanNativePath;
+extern cl::opt<bool> EnableWideActiveLaneMask;
 extern cl::opt<unsigned> ForceTargetInstructionCost;
 
 /// VPlan-based builder utility analogous to IRBuilder.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e7bae17dd2ceb..6e5f4caf93d23 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -356,6 +356,10 @@ cl::opt<bool> llvm::EnableVPlanNativePath(
     cl::desc("Enable VPlan-native vectorization path with "
              "support for outer loop vectorization."));
 
+cl::opt<bool> llvm::EnableWideActiveLaneMask(
+    "enable-wide-lane-mask", cl::init(false), cl::Hidden,
+    cl::desc("Enable use of wide get active lane mask instructions"));
+
 cl::opt<bool> llvm::VerifyEachVPlan(
     "vplan-verify-each",
 #ifdef EXPENSIVE_CHECKS
@@ -7328,7 +7332,10 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
     VPlanTransforms::runPass(VPlanTransforms::addBranchWeightToMiddleTerminator,
                              BestVPlan, BestVF, VScale);
   }
-  VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
+  VPlanTransforms::optimizeForVFAndUF(
+      BestVPlan, BestVF, BestUF, PSE,
+      ILV.Cost->getTailFoldingStyle() ==
+          TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck);
   VPlanTransforms::simplifyRecipes(BestVPlan, *Legal->getWidestInductionType());
   VPlanTransforms::narrowInterleaveGroups(
       BestVPlan, BestVF,
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 356af4a0e74e4..6080aa88ec306 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -954,6 +954,9 @@ class VPInstruction : public VPRecipeWithIRFlags,
     // part if it is scalar. In the latter case, the recipe will be removed
     // during unrolling.
     ExtractPenultimateElement,
+    // Extracts a subvector from a vector (first operand) starting at a given
+    // offset (second operand).
+    ExtractSubvector,
     LogicalAnd, // Non-poison propagating logical And.
     // Add an offset in bytes (second operand) to a base pointer (first
     // operand). Only generates scalar values (either for the first lane only or
@@ -1887,6 +1890,9 @@ class VPHeaderPHIRecipe : public VPSingleDefRecipe, public VPPhiAccessors {
     return getOperand(1);
   }
 
+  // Update the incoming value from the loop backedge.
+  void setBackedgeValue(VPValue *V) { setOperand(1, V); }
+
   /// Returns the backedge value as a recipe. The backedge value is guaranteed
   /// to be a recipe.
   virtual VPRecipeBase &getBackedgeRecipe() {
@@ -3234,10 +3240,12 @@ class VPCanonicalIVPHIRecipe : public VPHeaderPHIRecipe {
 /// TODO: It would be good to use the existing VPWidenPHIRecipe instead and
 /// remove VPActiveLaneMaskPHIRecipe.
 class VPActiveLaneMaskPHIRecipe : public VPHeaderPHIRecipe {
+  unsigned UnrollPart = 0;
+
 public:
-  VPActiveLaneMaskPHIRecipe(VPValue *StartMask, DebugLoc DL)
-      : VPHeaderPHIRecipe(VPDef::VPActiveLaneMaskPHISC, nullptr, StartMask,
-                          DL) {}
+  VPActiveLaneMaskPHIRecipe(VPValue *StartMask, DebugLoc DL, unsigned Part = 0)
+      : VPHeaderPHIRecipe(VPDef::VPActiveLaneMaskPHISC, nullptr, StartMask, DL),
+        UnrollPart(Part) {}
 
   ~VPActiveLaneMaskPHIRecipe() override = default;
 
@@ -3250,6 +3258,9 @@ class VPActiveLaneMaskPHIRecipe : public VPHeaderPHIRecipe {
 
   VP_CLASSOF_IMPL(VPDef::VPActiveLaneMaskPHISC)
 
+  unsigned getUnrollPart() { return UnrollPart; }
+  void setUnrollPart(unsigned Part) { UnrollPart = Part; }
+
   /// Generate the active lane mask phi of the vector loop.
   void execute(VPTransformState &State) override;
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 92db9674ef42b..5e7f797b70978 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -74,6 +74,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
   switch (Opcode) {
   case Instruction::ExtractElement:
   case Instruction::Freeze:
+  case VPInstruction::ExtractSubvector:
   case VPInstruction::ReductionStartVector:
     return inferScalarType(R->getOperand(0));
   case Instruction::Select: {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index efea99f22d086..62898bf2c1991 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -384,10 +384,11 @@ m_Broadcast(const Op0_t &Op0) {
   return m_VPInstruction<VPInstruction::Broadcast>(Op0);
 }
 
-template <typename Op0_t, typename Op1_t>
-inline BinaryVPInstruction_match<Op0_t, Op1_t, VPInstruction::ActiveLaneMask>
-m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1) {
-  return m_VPInstruction<VPInstruction::ActiveLaneMask>(Op0, Op1);
+template <typename Op0_t, typename Op1_t, typename Op2_t>
+inline TernaryVPInstruction_match<Op0_t, Op1_t, Op2_t,
+                                  VPInstruction::ActiveLaneMask>
+m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2) {
+  return m_VPInstruction<VPInstruction::ActiveLaneMask>(Op0, Op1, Op2);
 }
 
 template <typename Op0_t, typename Op1_t>
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index ccb7512051d77..c776d5cb91278 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -469,15 +469,16 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
   case Instruction::ICmp:
   case Instruction::FCmp:
   case Instruction::Store:
-  case VPInstruction::ActiveLaneMask:
   case VPInstruction::BranchOnCount:
   case VPInstruction::ComputeReductionResult:
+  case VPInstruction::ExtractSubvector:
   case VPInstruction::FirstOrderRecurrenceSplice:
   case VPInstruction::LogicalAnd:
   case VPInstruction::PtrAdd:
   case VPInstruction::WideIVStep:
     return 2;
   case Instruction::Select:
+  case VPInstruction::ActiveLaneMask:
   case VPInstruction::ComputeAnyOfResult:
   case VPInstruction::ReductionStartVector:
     return 3;
@@ -614,7 +615,9 @@ Value *VPInstruction::generate(VPTransformState &State) {
                                Name);
 
     auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
-    auto *PredTy = VectorType::get(Int1Ty, State.VF);
+    auto PredTy = VectorType::get(
+        Int1Ty, State.VF * cast<ConstantInt>(getOperand(2)->getLiveInIRValue())
+                               ->getZExtValue());
     return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
                                    {PredTy, ScalarTC->getType()},
                                    {VIVElem0, ScalarTC}, nullptr, Name);
@@ -846,6 +849,14 @@ Value *VPInstruction::generate(VPTransformState &State) {
     Res->setName(Name);
     return Res;
   }
+  case VPInstruction::ExtractSubvector: {
+    Value *Vec = State.get(getOperand(0));
+    assert(State.VF.isVector());
+    auto Idx = cast<ConstantInt>(getOperand(1)->getLiveInIRValue());
+    auto ResTy = VectorType::get(
+        State.TypeAnalysis.inferScalarType(getOperand(0)), State.VF);
+    return Builder.CreateExtractVector(ResTy, Vec, Idx);
+  }
   case VPInstruction::LogicalAnd: {
     Value *A = State.get(getOperand(0));
     Value *B = State.get(getOperand(1));
@@ -1044,6 +1055,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
   case VPInstruction::CanonicalIVIncrementForPart:
   case VPInstruction::ExtractLastElement:
   case VPInstruction::ExtractPenultimateElement:
+  case VPInstruction::ExtractSubvector:
   case VPInstruction::FirstActiveLane:
   case VPInstruction::FirstOrderRecurrenceSplice:
   case VPInstruction::LogicalAnd:
@@ -1174,6 +1186,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
   case VPInstruction::ExtractPenultimateElement:
     O << "extract-penultimate-element";
     break;
+  case VPInstruction::ExtractSubvector:
+    O << "extract-subvector";
+    break;
   case VPInstruction::ComputeAnyOfResult:
     O << "compute-anyof-result";
     break;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 90137b72c83fb..b8f14ca88e8a3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "VPlanTransforms.h"
+#include "LoopVectorizationPlanner.h"
 #include "VPRecipeBuilder.h"
 #include "VPlan.h"
 #include "VPlanAnalysis.h"
@@ -1432,20 +1433,93 @@ static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan,
   return SE.isKnownPredicate(CmpInst::ICMP_EQ, TripCount, C);
 }
 
+static void extractFromWideActiveLaneMask(VPlan &Plan, ElementCount VF,
+                                          unsigned UF) {
+  VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
+  auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
+  VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
+  auto *Term = &ExitingVPBB->back();
+
+  VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
+  LLVMContext &Ctx = CanonicalIV->getScalarType()->getContext();
+  using namespace llvm::VPlanPatternMatch;
+
+  auto extractFromALM = [&](VPInstruction *ALM, VPInstruction *InsBefore,
+                            SmallVectorImpl<VPValue *> &Extracts) {
+    VPBuilder Builder(InsBefore);
+    DebugLoc DL = ALM->getDebugLoc();
+    for (unsigned Part = 0; Part < UF; ++Part) {
+      SmallVector<VPValue *> Ops;
+      Ops.append({ALM, Plan.getOrAddLiveIn(
+                           ConstantInt::get(IntegerType::getInt64Ty(Ctx),
+                                            VF.getKnownMinValue() * Part))});
+      Extracts.push_back(
+          Builder.createNaryOp(VPInstruction::ExtractSubvector, Ops, DL));
+    }
+  };
+
+  // Create a list of each active lane mask phi, ordered by unroll part.
+  SmallVector<VPActiveLaneMaskPHIRecipe *> Phis(UF, nullptr);
+  for (VPRecipeBase &R : Header->phis())
+    if (auto *Phi = dyn_cast<VPActiveLaneMaskPHIRecipe>(&R))
+      Phis[Phi->getUnrollPart()] = Phi;
+
+  assert(all_of(Phis, [](VPActiveLaneMaskPHIRecipe *Phi) { return Phi; }) &&
+         "Expected one VPActiveLaneMaskPHIRecipe for each unroll part");
+
+  // When using wide lane masks, the return type of the get.active.lane.mask
+  // intrinsic is VF x UF (second operand).
+  VPValue *ALMMultiplier =
+      Plan.getOrAddLiveIn(ConstantInt::get(IntegerType::getInt64Ty(Ctx), UF));
+  cast<VPInstruction>(Phis[0]->getStartValue())->setOperand(2, ALMMultiplier);
+  cast<VPInstruction>(Phis[0]->getBackedgeValue())
+      ->setOperand(2, ALMMultiplier);
+
+  // Create UF x extract vectors and insert into preheader.
+  SmallVector<VPValue *> EntryExtracts;
+  auto *EntryALM = cast<VPInstruction>(Phis[0]->getStartValue());
+  extractFromALM(EntryALM, cast<VPInstruction>(&EntryALM->getParent()->back()),
+                 EntryExtracts);
+
+  // Create UF x extract vectors and insert before the loop compare & branch,
+  // updating the compare to use the first extract.
+  SmallVector<VPValue *> LoopExtracts;
+  auto *LoopALM = cast<VPInstruction>(Phis[0]->getBackedgeValue());
+  VPInstruction *Not = cast<VPInstruction>(Term->getOperand(0));
+  extractFromALM(LoopALM, Not, LoopExtracts);
+  Not->setOperand(0, LoopExtracts[0]);
+
+  // Update the incoming values of active lane mask phis.
+  for (unsigned Part = 0; Part < UF; ++Part) {
+    Phis[Part]->setStartValue(EntryExtracts[Part]);
+    Phis[Part]->setBackedgeValue(LoopExtracts[Part]);
+  }
+
+  return;
+}
+
 /// Try to simplify the branch condition of \p Plan. This may restrict the
 /// resulting plan to \p BestVF and \p BestUF.
-static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
-                                              unsigned BestUF,
-                                              PredicatedScalarEvolution &PSE) {
+static bool
+simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
+                                  unsigned BestUF,
+                                  PredicatedScalarEvolution &PSE,
+                                  bool DataAndControlFlowWithoutRuntimeCheck) {
   VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
   VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
   auto *Term = &ExitingVPBB->back();
   VPValue *Cond;
   ScalarEvolution &SE = *PSE.getSE();
   using namespace llvm::VPlanPatternMatch;
-  if (match(Term, m_BranchOnCount(m_VPValue(), m_VPValue())) ||
-      match(Term, m_BranchOnCond(
-                      m_Not(m_ActiveLaneMask(m_VPValue(), m_VPValue()))))) {
+  auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
+  bool BranchALM = match(Term, m_BranchOnCond(m_Not(m_ActiveLaneMask(
+                                   m_VPValue(), m_VPValue(), m_VPValue()))));
+
+  if (BranchALM || match(Term, m_BranchOnCount(m_VPValue(), m_VPValue()))) {
+    if (BranchALM && DataAndControlFlowWithoutRuntimeCheck &&
+        EnableWideActiveLaneMask && BestVF.isVector() && BestUF > 1)
+      extractFromWideActiveLaneMask(Plan, BestVF, BestUF);
+
     // Try to simplify the branch condition if TC <= VF * UF when the latch
     // terminator is BranchOnCount or BranchOnCond where the input is
     // Not(ActiveLaneMask).
@@ -1470,7 +1544,6 @@ static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
   // The vector loop region only executes once. If possible, completely remove
   // the region, otherwise replace the terminator controlling the latch with
   // (BranchOnCond true).
-  auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
   auto *CanIVTy = Plan.getCanonicalIV()->getScalarType();
   if (all_of(
           Header->phis(),
@@ -1507,14 +1580,15 @@ static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
   return true;
 }
 
-void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
-                                         unsigned BestUF,
-                                         PredicatedScalarEvolution &PSE) {
+void VPlanTransforms::optimizeForVFAndUF(
+    VPlan &Plan, ElementCount BestVF, unsigned BestUF,
+    PredicatedScalarEvolution &PSE,
+    bool DataAndControlFlowWithoutRuntimeCheck) {
   assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
   assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
 
-  bool MadeChange =
-      simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
+  bool MadeChange = simplifyBranchConditionForVFAndUF(
+      Plan, BestVF, BestUF, PSE, DataAndControlFlowWithoutRuntimeCheck);
   MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
 
   if (MadeChange) {
@@ -2006,9 +2080,11 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
                                      "index.part.next");
 
   // Create the active lane mask instruction in the VPlan preheader.
-  auto *EntryALM =
-      Builder.createNaryOp(VPInstruction::ActiveLaneMask, {EntryIncrement, TC},
-                           DL, "active.lane.mask.entry");
+  VPValue *ALMMultiplier = Plan.getOrAddLiveIn(
+      ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1));
+  auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
+                                        {EntryIncrement, TC, ALMMultiplier}, DL,
+                                        "active.lane.mask.entry");
 
   // Now create the ActiveLaneMaskPhi recipe in the main loop using the
   // preheader ActiveLaneMask instruction.
@@ -2023,8 +2099,8 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
       Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart,
                                   {IncrementValue}, {false, false}, DL);
   auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
-                                   {InLoopIncrement, TripCount}, DL,
-                                   "active.lane.mask.next");
+                                   {InLoopIncrement, TripCount, ALMMultiplier},
+                                   DL, "active.lane.mask.next");
   LaneMaskPhi->addOperand(ALM);
 
   // Replace the original terminator with BranchOnCond. We have to invert the
@@ -2101,9 +2177,12 @@ void VPlanTransforms::addActiveLaneMask(
         Plan, DataAndControlFlowWithoutRuntimeCheck);
   } else {
     VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
-    LaneMask = B.createNaryOp(VPInstruction::ActiveLaneMask,
-                              {WideCanonicalIV, Plan.getTripCount()}, nullptr,
-                              "active.lane.mask");
+    VPValue *ALMMultiplier = Plan.getOrAddLiveIn(
+        ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1));
+    LaneMask =
+        B.createNaryOp(VPInstruction::ActiveLaneMask,
+                       {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
+                       nullptr, "active.lane.mask");
   }
 
   // Walk users of WideCanonicalIV and replace all compares of the form
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 8d2eded45da22..920c7aa32cc97 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -109,7 +109,8 @@ struct VPlanTransforms {
   /// resulting plan to \p BestVF and \p BestUF.
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, - PredicatedScalarEvolution &PSE); + PredicatedScalarEvolution &PSE, + bool DataAndControlFlowWithoutRuntimeCheck); /// Apply VPlan-to-VPlan optimizations to \p Plan, including induction recipe /// optimizations, dead recipe removal, replicate region optimizations and diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp index 2dd43c092ff7a..76a37d5ba839b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp @@ -250,6 +250,7 @@ void UnrollState::unrollHeaderPHIByUF(VPHeaderPHIRecipe *R, } else { assert(isa(R) && "unexpected header phi recipe not needing unrolled part"); + cast(Copy)->setUnrollPart(Part); } } } diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index 81bd21bb904c0..9fdc199fc1dfa 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -61,7 +61,7 @@ bool vputils::isHeaderMask(const VPValue *V, VPlan &Plan) { VPValue *A, *B; using namespace VPlanPatternMatch; - if (match(V, m_ActiveLaneMask(m_VPValue(A), m_VPValue(B)))) + if (match(V, m_ActiveLaneMask(m_VPValue(A), m_VPValue(B), m_SpecificInt(1)))) return B == Plan.getTripCount() && (match(A, m_ScalarIVSteps(m_Specific(Plan.getCanonicalIV()), m_SpecificInt(1), diff --git a/llvm/test/CodeGen/AArch64/sve-wide-lane-mask.ll b/llvm/test/CodeGen/AArch64/sve-wide-lane-mask.ll new file mode 100644 index 0000000000000..d59dbec491467 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-wide-lane-mask.ll @@ -0,0 +1,121 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mattr=+sve < %s | FileCheck %s -check-prefix CHECK-SVE +; RUN: llc -mattr=+sve2p1 < %s | FileCheck %s -check-prefix CHECK-SVE2p1 + +target triple = "aarch64-unknown-linux" + +define void @scalable_wide_active_lane_mask(ptr %dst, ptr %src, i64 %n) #0 { +; CHECK-SVE-LABEL: scalable_wide_active_lane_mask: +; CHECK-SVE: // %bb.0: // %entry +; CHECK-SVE-NEXT: cmp x2, #1 +; CHECK-SVE-NEXT: b.lt .LBB0_3 +; CHECK-SVE-NEXT: // %bb.1: // %vector.ph +; CHECK-SVE-NEXT: rdvl x8, #2 +; CHECK-SVE-NEXT: rdvl x9, #1 +; CHECK-SVE-NEXT: mov x11, xzr +; CHECK-SVE-NEXT: subs x10, x2, x8 +; CHECK-SVE-NEXT: csel x10, xzr, x10, lo +; CHECK-SVE-NEXT: whilelo p1.b, xzr, x2 +; CHECK-SVE-NEXT: whilelo p0.b, x9, x2 +; CHECK-SVE-NEXT: .LBB0_2: // %vector.body +; CHECK-SVE-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-SVE-NEXT: add x12, x1, x11 +; CHECK-SVE-NEXT: ld1b { z0.b }, p1/z, [x1, x11] +; CHECK-SVE-NEXT: add x13, x0, x11 +; CHECK-SVE-NEXT: ld1b { z1.b }, p0/z, [x12, #1, mul vl] +; CHECK-SVE-NEXT: adds x12, x11, x9 +; CHECK-SVE-NEXT: csinv x12, x12, xzr, lo +; CHECK-SVE-NEXT: mul z0.b, z0.b, #3 +; CHECK-SVE-NEXT: mul z1.b, z1.b, #3 +; CHECK-SVE-NEXT: st1b { z0.b }, p1, [x0, x11] +; CHECK-SVE-NEXT: st1b { z1.b }, p0, [x13, #1, mul vl] +; CHECK-SVE-NEXT: whilelo p0.b, x12, x10 +; CHECK-SVE-NEXT: whilelo p1.b, x11, x10 +; CHECK-SVE-NEXT: add x11, x11, x8 +; CHECK-SVE-NEXT: b.mi .LBB0_2 +; CHECK-SVE-NEXT: .LBB0_3: // %for.end +; CHECK-SVE-NEXT: ret +; +; CHECK-SVE2p1-LABEL: scalable_wide_active_lane_mask: +; CHECK-SVE2p1: // %bb.0: // %entry +; CHECK-SVE2p1-NEXT: cmp x2, #1 +; CHECK-SVE2p1-NEXT: b.lt .LBB0_3 +; CHECK-SVE2p1-NEXT: // %bb.1: // %vector.ph +; CHECK-SVE2p1-NEXT: rdvl x9, #2 +; CHECK-SVE2p1-NEXT: mov x8, xzr +; 
CHECK-SVE2p1-NEXT: subs x9, x2, x9 +; CHECK-SVE2p1-NEXT: csel x9, xzr, x9, lo +; CHECK-SVE2p1-NEXT: whilelo { p0.b, p1.b }, xzr, x2 +; CHECK-SVE2p1-NEXT: .LBB0_2: // %vector.body +; CHECK-SVE2p1-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-SVE2p1-NEXT: add x10, x1, x8 +; CHECK-SVE2p1-NEXT: ld1b { z0.b }, p0/z, [x1, x8] +; CHECK-SVE2p1-NEXT: ld1b { z1.b }, p1/z, [x10, #1, mul vl] +; CHECK-SVE2p1-NEXT: add x10, x0, x8 +; CHECK-SVE2p1-NEXT: mul z0.b, z0.b, #3 +; CHECK-SVE2p1-NEXT: mul z1.b, z1.b, #3 +; CHECK-SVE2p1-NEXT: st1b { z0.b }, p0, [x0, x8] +; CHECK-SVE2p1-NEXT: st1b { z1.b }, p1, [x10, #1, mul vl] +; CHECK-SVE2p1-NEXT: whilelo { p0.b, p1.b }, x8, x9 +; CHECK-SVE2p1-NEXT: incb x8, all, mul #2 +; CHECK-SVE2p1-NEXT: mov z0.b, p0/z, #1 // =0x1 +; CHECK-SVE2p1-NEXT: fmov w10, s0 +; CHECK-SVE2p1-NEXT: tbnz w10, #0, .LBB0_2 +; CHECK-SVE2p1-NEXT: .LBB0_3: // %for.end +; CHECK-SVE2p1-NEXT: ret +entry: + %cmp = icmp sgt i64 %n, 0 + br i1 %cmp, label %vector.ph, label %for.end + +vector.ph: + %0 = tail call i64 @llvm.vscale.i64() + %1 = shl nuw nsw i64 %0, 5 + %2 = tail call i64 @llvm.vscale.i64() + %3 = shl nuw nsw i64 %2, 5 + %4 = tail call i64 @llvm.usub.sat.i64(i64 %n, i64 %3) + %active.lane.mask.entry = tail call @llvm.get.active.lane.mask.nxv32i1.i64(i64 0, i64 %n) + %5 = tail call @llvm.vector.extract.nxv16i1.nxv32i1( %active.lane.mask.entry, i64 0) + %6 = tail call @llvm.vector.extract.nxv16i1.nxv32i1( %active.lane.mask.entry, i64 16) + %7 = tail call i64 @llvm.vscale.i64() + %8 = shl nuw nsw i64 %7, 4 + %9 = tail call i64 @llvm.vscale.i64() + %10 = shl nuw nsw i64 %9, 4 + br label %vector.body + +vector.body: + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %active.lane.mask = phi [ %5, %vector.ph ], [ %17, %vector.body ] + %active.lane.mask2 = phi [ %6, %vector.ph ], [ %18, %vector.body ] + %11 = getelementptr inbounds nuw i8, ptr %src, i64 %index + %12 = getelementptr inbounds nuw i8, ptr %11, i64 %8 + %wide.masked.load = tail call @llvm.masked.load.nxv16i8.p0(ptr %11, i32 1, %active.lane.mask, poison) + %wide.masked.load3 = tail call @llvm.masked.load.nxv16i8.p0(ptr nonnull %12, i32 1, %active.lane.mask2, poison) + %13 = mul %wide.masked.load, splat (i8 3) + %14 = mul %wide.masked.load3, splat (i8 3) + %15 = getelementptr inbounds nuw i8, ptr %dst, i64 %index + %16 = getelementptr inbounds nuw i8, ptr %15, i64 %10 + tail call void @llvm.masked.store.nxv16i8.p0( %13, ptr %15, i32 1, %active.lane.mask) + tail call void @llvm.masked.store.nxv16i8.p0( %14, ptr %16, i32 1, %active.lane.mask2) + %index.next = add i64 %index, %1 + %active.lane.mask.next = tail call @llvm.get.active.lane.mask.nxv32i1.i64(i64 %index, i64 %4) + %17 = tail call @llvm.vector.extract.nxv16i1.nxv32i1( %active.lane.mask.next, i64 0) + %18 = tail call @llvm.vector.extract.nxv16i1.nxv32i1( %active.lane.mask.next, i64 16) + %19 = extractelement %17, i64 0 + br i1 %19, label %vector.body, label %for.end, !llvm.loop !0 + +for.end: + ret void +} + +declare i64 @llvm.vscale.i64() +declare @llvm.get.active.lane.mask.nxv32i1.i64(i64, i64) +declare @llvm.vector.extract.nxv16i1.nxv32i1(, i64 immarg) +declare @llvm.masked.load.nxv16i8.p0(ptr captures(none), i32 immarg, , ) +declare void @llvm.masked.store.nxv16i8.p0(, ptr captures(none), i32 immarg, ) +declare i64 @llvm.usub.sat.i64(i64, i64) + +attributes #0 = { vscale_range(1,16) } + +!0 = distinct !{!0, !1, !2} +!1 = !{!"llvm.loop.isvectorized", i32 1} +!2 = !{!"llvm.loop.unroll.runtime.disable"} diff --git 
a/llvm/test/Transforms/LoopVectorize/AArch64/fixed-wide-lane-mask.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fixed-wide-lane-mask.ll new file mode 100644 index 0000000000000..52128de119b9b --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/fixed-wide-lane-mask.ll @@ -0,0 +1,129 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S --passes="default" -scalable-vectorization=off -prefer-predicate-over-epilogue=predicate-dont-vectorize -enable-wide-lane-mask \ +; RUN: -force-vector-width=4 -force-vector-interleave=0 < %s | FileCheck %s -check-prefix CHECK-UF0 +; RUN: opt -S --passes="default" -scalable-vectorization=off -prefer-predicate-over-epilogue=predicate-dont-vectorize -enable-wide-lane-mask \ +; RUN: -force-vector-width=4 -force-vector-interleave=2 < %s | FileCheck %s -check-prefix CHECK-UF2 +; RUN: opt -S --passes="default" -scalable-vectorization=off -prefer-predicate-over-epilogue=predicate-dont-vectorize -enable-wide-lane-mask \ +; RUN: -force-vector-width=4 -force-vector-interleave=4 < %s | FileCheck %s -check-prefix CHECK-UF4 + +target triple = "aarch64-unknown-linux" + +define void @fixed_wide_active_lane_mask(ptr noalias %dst, ptr noalias readonly %src, i64 %n) #0 { +; CHECK-UF0-LABEL: define void @fixed_wide_active_lane_mask( +; CHECK-UF0-SAME: ptr noalias writeonly captures(none) [[DST:%.*]], ptr noalias readonly captures(none) [[SRC:%.*]], i64 [[N:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-UF0-NEXT: entry: +; CHECK-UF0-NEXT: [[LD:%.*]] = load i32, ptr [[SRC]], align 4 +; CHECK-UF0-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.usub.sat.i64(i64 [[N]], i64 4) +; CHECK-UF0-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 0, i64 [[N]]) +; CHECK-UF0-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[LD]], i64 0 +; CHECK-UF0-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-UF0-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-UF0: vector.body: +; CHECK-UF0-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-UF0-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-UF0-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i32, ptr [[DST]], i64 [[INDEX]] +; CHECK-UF0-NEXT: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP2]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-UF0-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; CHECK-UF0-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = tail call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 [[TMP0]]) +; CHECK-UF0-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; CHECK-UF0-NEXT: br i1 [[TMP3]], label [[VECTOR_BODY]], label [[FOR_END:%.*]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-UF0: for.end: +; CHECK-UF0-NEXT: ret void +; +; CHECK-UF2-LABEL: define void @fixed_wide_active_lane_mask( +; CHECK-UF2-SAME: ptr noalias writeonly captures(none) [[DST:%.*]], ptr noalias readonly captures(none) [[SRC:%.*]], i64 [[N:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-UF2-NEXT: entry: +; CHECK-UF2-NEXT: [[LD:%.*]] = load i32, ptr [[SRC]], align 4 +; CHECK-UF2-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.usub.sat.i64(i64 [[N]], i64 8) +; CHECK-UF2-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <8 x i1> 
@llvm.get.active.lane.mask.v8i1.i64(i64 0, i64 [[N]]) +; CHECK-UF2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i1> [[ACTIVE_LANE_MASK_ENTRY]], <8 x i1> poison, <4 x i32> +; CHECK-UF2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i1> [[ACTIVE_LANE_MASK_ENTRY]], <8 x i1> poison, <4 x i32> +; CHECK-UF2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[LD]], i64 0 +; CHECK-UF2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-UF2-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-UF2: vector.body: +; CHECK-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-UF2-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[TMP1]], [[ENTRY]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-UF2-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi <4 x i1> [ [[TMP2]], [[ENTRY]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] +; CHECK-UF2-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i32, ptr [[DST]], i64 [[INDEX]] +; CHECK-UF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP3]], i64 16 +; CHECK-UF2-NEXT: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-UF2-NEXT: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP4]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK2]]) +; CHECK-UF2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 +; CHECK-UF2-NEXT: [[ACTIVE_LANE_MASK_NEXT:%.*]] = tail call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 [[INDEX]], i64 [[TMP0]]) +; CHECK-UF2-NEXT: [[TMP5]] = shufflevector <8 x i1> [[ACTIVE_LANE_MASK_NEXT]], <8 x i1> poison, <4 x i32> +; CHECK-UF2-NEXT: [[TMP6]] = shufflevector <8 x i1> [[ACTIVE_LANE_MASK_NEXT]], <8 x i1> poison, <4 x i32> +; CHECK-UF2-NEXT: [[TMP7:%.*]] = extractelement <8 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; CHECK-UF2-NEXT: br i1 [[TMP7]], label [[VECTOR_BODY]], label [[FOR_END:%.*]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-UF2: for.end: +; CHECK-UF2-NEXT: ret void +; +; CHECK-UF4-LABEL: define void @fixed_wide_active_lane_mask( +; CHECK-UF4-SAME: ptr noalias writeonly captures(none) [[DST:%.*]], ptr noalias readonly captures(none) [[SRC:%.*]], i64 [[N:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-UF4-NEXT: entry: +; CHECK-UF4-NEXT: [[LD:%.*]] = load i32, ptr [[SRC]], align 4 +; CHECK-UF4-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.usub.sat.i64(i64 [[N]], i64 16) +; CHECK-UF4-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 0, i64 [[N]]) +; CHECK-UF4-NEXT: [[TMP1:%.*]] = shufflevector <16 x i1> [[ACTIVE_LANE_MASK_ENTRY]], <16 x i1> poison, <4 x i32> +; CHECK-UF4-NEXT: [[TMP2:%.*]] = shufflevector <16 x i1> [[ACTIVE_LANE_MASK_ENTRY]], <16 x i1> poison, <4 x i32> +; CHECK-UF4-NEXT: [[TMP3:%.*]] = shufflevector <16 x i1> [[ACTIVE_LANE_MASK_ENTRY]], <16 x i1> poison, <4 x i32> +; CHECK-UF4-NEXT: [[TMP4:%.*]] = shufflevector <16 x i1> [[ACTIVE_LANE_MASK_ENTRY]], <16 x i1> poison, <4 x i32> +; CHECK-UF4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[LD]], i64 0 +; CHECK-UF4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-UF4-NEXT: br label [[VECTOR_BODY1:%.*]] +; CHECK-UF4: vector.body: +; CHECK-UF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY1]] ] +; CHECK-UF4-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[TMP1]], 
[[ENTRY]] ], [ [[TMP9:%.*]], [[VECTOR_BODY1]] ] +; CHECK-UF4-NEXT: [[ACTIVE_LANE_MASK4:%.*]] = phi <4 x i1> [ [[TMP2]], [[ENTRY]] ], [ [[TMP10:%.*]], [[VECTOR_BODY1]] ] +; CHECK-UF4-NEXT: [[ACTIVE_LANE_MASK5:%.*]] = phi <4 x i1> [ [[TMP3]], [[ENTRY]] ], [ [[TMP11:%.*]], [[VECTOR_BODY1]] ] +; CHECK-UF4-NEXT: [[ACTIVE_LANE_MASK6:%.*]] = phi <4 x i1> [ [[TMP4]], [[ENTRY]] ], [ [[TMP12:%.*]], [[VECTOR_BODY1]] ] +; CHECK-UF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i32, ptr [[DST]], i64 [[INDEX]] +; CHECK-UF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP5]], i64 16 +; CHECK-UF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP5]], i64 32 +; CHECK-UF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP5]], i64 48 +; CHECK-UF4-NEXT: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP5]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-UF4-NEXT: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP6]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK4]]) +; CHECK-UF4-NEXT: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP7]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK5]]) +; CHECK-UF4-NEXT: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP8]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK6]]) +; CHECK-UF4-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 +; CHECK-UF4-NEXT: [[ACTIVE_LANE_MASK_NEXT:%.*]] = tail call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 [[INDEX]], i64 [[TMP0]]) +; CHECK-UF4-NEXT: [[TMP9]] = shufflevector <16 x i1> [[ACTIVE_LANE_MASK_NEXT]], <16 x i1> poison, <4 x i32> +; CHECK-UF4-NEXT: [[TMP10]] = shufflevector <16 x i1> [[ACTIVE_LANE_MASK_NEXT]], <16 x i1> poison, <4 x i32> +; CHECK-UF4-NEXT: [[TMP11]] = shufflevector <16 x i1> [[ACTIVE_LANE_MASK_NEXT]], <16 x i1> poison, <4 x i32> +; CHECK-UF4-NEXT: [[TMP12]] = shufflevector <16 x i1> [[ACTIVE_LANE_MASK_NEXT]], <16 x i1> poison, <4 x i32> +; CHECK-UF4-NEXT: [[TMP13:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; CHECK-UF4-NEXT: br i1 [[TMP13]], label [[VECTOR_BODY1]], label [[FOR_END:%.*]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-UF4: for.end: +; CHECK-UF4-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %ld = load i32, ptr %src + %arrayidx = getelementptr inbounds i32, ptr %dst, i64 %iv + store i32 %ld, ptr %arrayidx + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: + ret void +} + +attributes #0 = { nounwind "target-features"="+neon,+sve" } + +;. +; CHECK-UF0: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-UF0: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-UF0: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +;. +; CHECK-UF2: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-UF2: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-UF2: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +;. +; CHECK-UF4: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-UF4: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-UF4: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +;. 
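For reference, below is a minimal hand-written sketch of the fixed-width pattern this test exercises for VF=4/UF=2. It is an illustration distilled from the checks above, not output of the patch, and the value names (%wide.mask, %mask.part0, %gep0, %data0, ...) are hypothetical. The vectorizer emits llvm.vector.extract for the new ExtractSubvector recipe; later passes in the default pipeline appear to fold these fixed-width extracts into the shufflevector sequences seen in the CHECK-UF2/CHECK-UF4 lines.

; Sketch (assumed names): one wide <8 x i1> lane mask feeds both unrolled parts
; via llvm.vector.extract at element offsets 0 and 4.
  %wide.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 %index, i64 %n)
  %mask.part0 = call <4 x i1> @llvm.vector.extract.v4i1.v8i1(<8 x i1> %wide.mask, i64 0)
  %mask.part1 = call <4 x i1> @llvm.vector.extract.v4i1.v8i1(<8 x i1> %wide.mask, i64 4)
  call void @llvm.masked.store.v4i32.p0(<4 x i32> %data0, ptr %gep0, i32 4, <4 x i1> %mask.part0)
  call void @llvm.masked.store.v4i32.p0(<4 x i32> %data1, ptr %gep1, i32 4, <4 x i1> %mask.part1)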
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-wide-lane-mask.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-wide-lane-mask.ll new file mode 100644 index 0000000000000..790b78f9002d2 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-wide-lane-mask.ll @@ -0,0 +1,335 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S --passes="default" -prefer-predicate-over-epilogue=predicate-dont-vectorize -enable-wide-lane-mask -force-vector-interleave=0 < %s | FileCheck %s -check-prefix CHECK-SVE-UF0 +; RUN: opt -S --passes="default" -prefer-predicate-over-epilogue=predicate-dont-vectorize -enable-wide-lane-mask -force-vector-interleave=2 < %s | FileCheck %s -check-prefix CHECK-SVE-UF2 +; RUN: opt -S --passes="default" -prefer-predicate-over-epilogue=predicate-dont-vectorize -enable-wide-lane-mask -force-vector-interleave=4 < %s | FileCheck %s -check-prefix CHECK-SVE-UF4 + +target triple = "aarch64-unknown-linux" + +define void @scalable_wide_active_lane_mask(ptr noalias %dst, ptr readonly %src, i64 %n) #0 { +; CHECK-SVE-UF0-LABEL: define void @scalable_wide_active_lane_mask( +; CHECK-SVE-UF0-SAME: ptr noalias writeonly captures(none) [[DST:%.*]], ptr readonly captures(none) [[SRC:%.*]], i64 [[N:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-SVE-UF0-NEXT: entry: +; CHECK-SVE-UF0-NEXT: [[CMP7:%.*]] = icmp sgt i64 [[N]], 0 +; CHECK-SVE-UF0-NEXT: br i1 [[CMP7]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK-SVE-UF0: vector.ph: +; CHECK-SVE-UF0-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF0-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4 +; CHECK-SVE-UF0-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF0-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 4 +; CHECK-SVE-UF0-NEXT: [[TMP4:%.*]] = tail call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP3]]) +; CHECK-SVE-UF0-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[N]]) +; CHECK-SVE-UF0-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-SVE-UF0: vector.body: +; CHECK-SVE-UF0-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF0-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF0-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[SRC]], i64 [[INDEX]] +; CHECK-SVE-UF0-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call @llvm.masked.load.nxv16i8.p0(ptr [[TMP5]], i32 1, [[ACTIVE_LANE_MASK]], poison) +; CHECK-SVE-UF0-NEXT: [[TMP6:%.*]] = mul [[WIDE_MASKED_LOAD]], splat (i8 3) +; CHECK-SVE-UF0-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[DST]], i64 [[INDEX]] +; CHECK-SVE-UF0-NEXT: tail call void @llvm.masked.store.nxv16i8.p0( [[TMP6]], ptr [[TMP7]], i32 1, [[ACTIVE_LANE_MASK]]) +; CHECK-SVE-UF0-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] +; CHECK-SVE-UF0-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP4]]) +; CHECK-SVE-UF0-NEXT: [[TMP8:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; CHECK-SVE-UF0-NEXT: br i1 [[TMP8]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-SVE-UF0: for.end: +; CHECK-SVE-UF0-NEXT: ret void +; +; CHECK-SVE-UF2-LABEL: define void @scalable_wide_active_lane_mask( +; CHECK-SVE-UF2-SAME: ptr noalias writeonly captures(none) [[DST:%.*]], ptr readonly 
captures(none) [[SRC:%.*]], i64 [[N:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-SVE-UF2-NEXT: entry: +; CHECK-SVE-UF2-NEXT: [[CMP7:%.*]] = icmp sgt i64 [[N]], 0 +; CHECK-SVE-UF2-NEXT: br i1 [[CMP7]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK-SVE-UF2: vector.ph: +; CHECK-SVE-UF2-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF2-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 5 +; CHECK-SVE-UF2-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF2-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 5 +; CHECK-SVE-UF2-NEXT: [[TMP4:%.*]] = tail call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP3]]) +; CHECK-SVE-UF2-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call @llvm.get.active.lane.mask.nxv32i1.i64(i64 0, i64 [[N]]) +; CHECK-SVE-UF2-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i1.nxv32i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 0) +; CHECK-SVE-UF2-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i1.nxv32i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 16) +; CHECK-SVE-UF2-NEXT: [[TMP7:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF2-NEXT: [[TMP8:%.*]] = shl nuw nsw i64 [[TMP7]], 4 +; CHECK-SVE-UF2-NEXT: [[TMP9:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF2-NEXT: [[TMP10:%.*]] = shl nuw nsw i64 [[TMP9]], 4 +; CHECK-SVE-UF2-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-SVE-UF2: vector.body: +; CHECK-SVE-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF2-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[TMP5]], [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF2-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[TMP6]], [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF2-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[SRC]], i64 [[INDEX]] +; CHECK-SVE-UF2-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP11]], i64 [[TMP8]] +; CHECK-SVE-UF2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call @llvm.masked.load.nxv16i8.p0(ptr [[TMP11]], i32 1, [[ACTIVE_LANE_MASK]], poison) +; CHECK-SVE-UF2-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = tail call @llvm.masked.load.nxv16i8.p0(ptr nonnull [[TMP12]], i32 1, [[ACTIVE_LANE_MASK2]], poison) +; CHECK-SVE-UF2-NEXT: [[TMP13:%.*]] = mul [[WIDE_MASKED_LOAD]], splat (i8 3) +; CHECK-SVE-UF2-NEXT: [[TMP14:%.*]] = mul [[WIDE_MASKED_LOAD3]], splat (i8 3) +; CHECK-SVE-UF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw i8, ptr [[DST]], i64 [[INDEX]] +; CHECK-SVE-UF2-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP15]], i64 [[TMP10]] +; CHECK-SVE-UF2-NEXT: tail call void @llvm.masked.store.nxv16i8.p0( [[TMP13]], ptr [[TMP15]], i32 1, [[ACTIVE_LANE_MASK]]) +; CHECK-SVE-UF2-NEXT: tail call void @llvm.masked.store.nxv16i8.p0( [[TMP14]], ptr [[TMP16]], i32 1, [[ACTIVE_LANE_MASK2]]) +; CHECK-SVE-UF2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] +; CHECK-SVE-UF2-NEXT: [[ACTIVE_LANE_MASK_NEXT:%.*]] = tail call @llvm.get.active.lane.mask.nxv32i1.i64(i64 [[INDEX]], i64 [[TMP4]]) +; CHECK-SVE-UF2-NEXT: [[TMP17]] = tail call @llvm.vector.extract.nxv16i1.nxv32i1( [[ACTIVE_LANE_MASK_NEXT]], i64 0) +; CHECK-SVE-UF2-NEXT: [[TMP18]] = tail call @llvm.vector.extract.nxv16i1.nxv32i1( [[ACTIVE_LANE_MASK_NEXT]], i64 16) +; CHECK-SVE-UF2-NEXT: [[TMP19:%.*]] = extractelement [[TMP17]], i64 0 +; CHECK-SVE-UF2-NEXT: br i1 [[TMP19]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-SVE-UF2: for.end: +; CHECK-SVE-UF2-NEXT: ret void +; +; CHECK-SVE-UF4-LABEL: define void 
@scalable_wide_active_lane_mask( +; CHECK-SVE-UF4-SAME: ptr noalias writeonly captures(none) [[DST:%.*]], ptr readonly captures(none) [[SRC:%.*]], i64 [[N:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-SVE-UF4-NEXT: entry: +; CHECK-SVE-UF4-NEXT: [[CMP7:%.*]] = icmp sgt i64 [[N]], 0 +; CHECK-SVE-UF4-NEXT: br i1 [[CMP7]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK-SVE-UF4: vector.ph: +; CHECK-SVE-UF4-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 6 +; CHECK-SVE-UF4-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 6 +; CHECK-SVE-UF4-NEXT: [[TMP4:%.*]] = tail call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP3]]) +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call @llvm.get.active.lane.mask.nxv64i1.i64(i64 0, i64 [[N]]) +; CHECK-SVE-UF4-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv16i1.nxv64i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 0) +; CHECK-SVE-UF4-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv16i1.nxv64i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 16) +; CHECK-SVE-UF4-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv16i1.nxv64i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 32) +; CHECK-SVE-UF4-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.extract.nxv16i1.nxv64i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 48) +; CHECK-SVE-UF4-NEXT: [[TMP9:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP10:%.*]] = shl nuw nsw i64 [[TMP9]], 4 +; CHECK-SVE-UF4-NEXT: [[TMP11:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 5 +; CHECK-SVE-UF4-NEXT: [[TMP13:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP14:%.*]] = mul nuw nsw i64 [[TMP13]], 48 +; CHECK-SVE-UF4-NEXT: [[TMP15:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP16:%.*]] = shl nuw nsw i64 [[TMP15]], 4 +; CHECK-SVE-UF4-NEXT: [[TMP17:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP18:%.*]] = shl nuw nsw i64 [[TMP17]], 5 +; CHECK-SVE-UF4-NEXT: [[TMP19:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF4-NEXT: [[TMP20:%.*]] = mul nuw nsw i64 [[TMP19]], 48 +; CHECK-SVE-UF4-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-SVE-UF4: vector.body: +; CHECK-SVE-UF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[TMP5]], [[VECTOR_PH]] ], [ [[TMP33:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK6:%.*]] = phi [ [[TMP6]], [[VECTOR_PH]] ], [ [[TMP34:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK7:%.*]] = phi [ [[TMP7]], [[VECTOR_PH]] ], [ [[TMP35:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi [ [[TMP8]], [[VECTOR_PH]] ], [ [[TMP36:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF4-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw i8, ptr [[SRC]], i64 [[INDEX]] +; CHECK-SVE-UF4-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP21]], i64 [[TMP10]] +; CHECK-SVE-UF4-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP21]], i64 [[TMP12]] +; CHECK-SVE-UF4-NEXT: [[TMP24:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP21]], i64 [[TMP14]] +; CHECK-SVE-UF4-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call @llvm.masked.load.nxv16i8.p0(ptr [[TMP21]], i32 1, [[ACTIVE_LANE_MASK]], poison) +; CHECK-SVE-UF4-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = tail call @llvm.masked.load.nxv16i8.p0(ptr nonnull [[TMP22]], i32 
1, [[ACTIVE_LANE_MASK6]], poison) +; CHECK-SVE-UF4-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = tail call @llvm.masked.load.nxv16i8.p0(ptr nonnull [[TMP23]], i32 1, [[ACTIVE_LANE_MASK7]], poison) +; CHECK-SVE-UF4-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = tail call @llvm.masked.load.nxv16i8.p0(ptr nonnull [[TMP24]], i32 1, [[ACTIVE_LANE_MASK8]], poison) +; CHECK-SVE-UF4-NEXT: [[TMP25:%.*]] = mul [[WIDE_MASKED_LOAD]], splat (i8 3) +; CHECK-SVE-UF4-NEXT: [[TMP26:%.*]] = mul [[WIDE_MASKED_LOAD9]], splat (i8 3) +; CHECK-SVE-UF4-NEXT: [[TMP27:%.*]] = mul [[WIDE_MASKED_LOAD10]], splat (i8 3) +; CHECK-SVE-UF4-NEXT: [[TMP28:%.*]] = mul [[WIDE_MASKED_LOAD11]], splat (i8 3) +; CHECK-SVE-UF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds nuw i8, ptr [[DST]], i64 [[INDEX]] +; CHECK-SVE-UF4-NEXT: [[TMP30:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP29]], i64 [[TMP16]] +; CHECK-SVE-UF4-NEXT: [[TMP31:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP29]], i64 [[TMP18]] +; CHECK-SVE-UF4-NEXT: [[TMP32:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP29]], i64 [[TMP20]] +; CHECK-SVE-UF4-NEXT: tail call void @llvm.masked.store.nxv16i8.p0( [[TMP25]], ptr [[TMP29]], i32 1, [[ACTIVE_LANE_MASK]]) +; CHECK-SVE-UF4-NEXT: tail call void @llvm.masked.store.nxv16i8.p0( [[TMP26]], ptr [[TMP30]], i32 1, [[ACTIVE_LANE_MASK6]]) +; CHECK-SVE-UF4-NEXT: tail call void @llvm.masked.store.nxv16i8.p0( [[TMP27]], ptr [[TMP31]], i32 1, [[ACTIVE_LANE_MASK7]]) +; CHECK-SVE-UF4-NEXT: tail call void @llvm.masked.store.nxv16i8.p0( [[TMP28]], ptr [[TMP32]], i32 1, [[ACTIVE_LANE_MASK8]]) +; CHECK-SVE-UF4-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] +; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK_NEXT:%.*]] = tail call @llvm.get.active.lane.mask.nxv64i1.i64(i64 [[INDEX]], i64 [[TMP4]]) +; CHECK-SVE-UF4-NEXT: [[TMP33]] = tail call @llvm.vector.extract.nxv16i1.nxv64i1( [[ACTIVE_LANE_MASK_NEXT]], i64 0) +; CHECK-SVE-UF4-NEXT: [[TMP34]] = tail call @llvm.vector.extract.nxv16i1.nxv64i1( [[ACTIVE_LANE_MASK_NEXT]], i64 16) +; CHECK-SVE-UF4-NEXT: [[TMP35]] = tail call @llvm.vector.extract.nxv16i1.nxv64i1( [[ACTIVE_LANE_MASK_NEXT]], i64 32) +; CHECK-SVE-UF4-NEXT: [[TMP36]] = tail call @llvm.vector.extract.nxv16i1.nxv64i1( [[ACTIVE_LANE_MASK_NEXT]], i64 48) +; CHECK-SVE-UF4-NEXT: [[TMP37:%.*]] = extractelement [[TMP33]], i64 0 +; CHECK-SVE-UF4-NEXT: br i1 [[TMP37]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-SVE-UF4: for.end: +; CHECK-SVE-UF4-NEXT: ret void +; +entry: + %cmp = icmp sgt i64 %n, 0 + br i1 %cmp, label %for.body, label %for.end + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx1 = getelementptr inbounds i8, ptr %src, i64 %iv + %ld = load i8, ptr %arrayidx1 + %mul = mul i8 %ld, 3 + %arrayidx2 = getelementptr inbounds i8, ptr %dst, i64 %iv + store i8 %mul, ptr %arrayidx2 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: + ret void +} + +define void @scalable_wide_active_lane_mask_float(ptr noalias %dst, ptr readonly %src, i32 %n) #0 { +; CHECK-SVE-UF0-LABEL: define void @scalable_wide_active_lane_mask_float( +; CHECK-SVE-UF0-SAME: ptr noalias writeonly captures(none) [[DST:%.*]], ptr readonly captures(none) [[SRC:%.*]], i32 [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CHECK-SVE-UF0-NEXT: entry: +; CHECK-SVE-UF0-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-SVE-UF0-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK-SVE-UF0: 
for.body.preheader: +; CHECK-SVE-UF0-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64 +; CHECK-SVE-UF0-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF0-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 1 +; CHECK-SVE-UF0-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE-UF0-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-SVE-UF0: vector.body: +; CHECK-SVE-UF0-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF0-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF0-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw double, ptr [[SRC]], i64 [[INDEX]] +; CHECK-SVE-UF0-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call @llvm.masked.load.nxv2f64.p0(ptr [[TMP2]], i32 8, [[ACTIVE_LANE_MASK]], poison) +; CHECK-SVE-UF0-NEXT: [[TMP3:%.*]] = fmul [[WIDE_MASKED_LOAD]], splat (double 3.000000e+00) +; CHECK-SVE-UF0-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw double, ptr [[DST]], i64 [[INDEX]] +; CHECK-SVE-UF0-NEXT: tail call void @llvm.masked.store.nxv2f64.p0( [[TMP3]], ptr [[TMP4]], i32 8, [[ACTIVE_LANE_MASK]]) +; CHECK-SVE-UF0-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] +; CHECK-SVE-UF0-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = tail call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE-UF0-NEXT: [[TMP5:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; CHECK-SVE-UF0-NEXT: br i1 [[TMP5]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-SVE-UF0: for.end: +; CHECK-SVE-UF0-NEXT: ret void +; +; CHECK-SVE-UF2-LABEL: define void @scalable_wide_active_lane_mask_float( +; CHECK-SVE-UF2-SAME: ptr noalias writeonly captures(none) [[DST:%.*]], ptr readonly captures(none) [[SRC:%.*]], i32 [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CHECK-SVE-UF2-NEXT: entry: +; CHECK-SVE-UF2-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-SVE-UF2-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK-SVE-UF2: for.body.preheader: +; CHECK-SVE-UF2-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64 +; CHECK-SVE-UF2-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF2-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-SVE-UF2-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]]) +; CHECK-SVE-UF2-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i1.nxv4i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 0) +; CHECK-SVE-UF2-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i1.nxv4i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 2) +; CHECK-SVE-UF2-NEXT: [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF2-NEXT: [[DOTIDX:%.*]] = shl nuw nsw i64 [[TMP4]], 4 +; CHECK-SVE-UF2-NEXT: [[TMP5:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-SVE-UF2-NEXT: [[DOTIDX5:%.*]] = shl nuw nsw i64 [[TMP5]], 4 +; CHECK-SVE-UF2-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-SVE-UF2: vector.body: +; CHECK-SVE-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF2-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[TMP2]], [[FOR_BODY_PREHEADER]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-UF2-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[TMP3]], [[FOR_BODY_PREHEADER]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] 
+; CHECK-SVE-UF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw double, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-SVE-UF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], i64 [[DOTIDX]]
+; CHECK-SVE-UF2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP6]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> poison)
+; CHECK-SVE-UF2-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = tail call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr nonnull [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 2 x double> poison)
+; CHECK-SVE-UF2-NEXT: [[TMP8:%.*]] = fmul <vscale x 2 x double> [[WIDE_MASKED_LOAD]], splat (double 3.000000e+00)
+; CHECK-SVE-UF2-NEXT: [[TMP9:%.*]] = fmul <vscale x 2 x double> [[WIDE_MASKED_LOAD3]], splat (double 3.000000e+00)
+; CHECK-SVE-UF2-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw double, ptr [[DST]], i64 [[INDEX]]
+; CHECK-SVE-UF2-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP10]], i64 [[DOTIDX5]]
+; CHECK-SVE-UF2-NEXT: tail call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP8]], ptr [[TMP10]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-SVE-UF2-NEXT: tail call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP9]], ptr [[TMP11]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]])
+; CHECK-SVE-UF2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
+; CHECK-SVE-UF2-NEXT: [[ACTIVE_LANE_MASK_NEXT:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-SVE-UF2-NEXT: [[TMP12]] = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0)
+; CHECK-SVE-UF2-NEXT: [[TMP13]] = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 2)
+; CHECK-SVE-UF2-NEXT: [[TMP14:%.*]] = extractelement <vscale x 2 x i1> [[TMP12]], i64 0
+; CHECK-SVE-UF2-NEXT: br i1 [[TMP14]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-SVE-UF2: for.end:
+; CHECK-SVE-UF2-NEXT: ret void
+;
+; CHECK-SVE-UF4-LABEL: define void @scalable_wide_active_lane_mask_float(
+; CHECK-SVE-UF4-SAME: ptr noalias writeonly captures(none) [[DST:%.*]], ptr readonly captures(none) [[SRC:%.*]], i32 [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-SVE-UF4-NEXT: entry:
+; CHECK-SVE-UF4-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-SVE-UF4-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK-SVE-UF4: for.body.preheader:
+; CHECK-SVE-UF4-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
+; CHECK-SVE-UF4-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF4-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 3
+; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+; CHECK-SVE-UF4-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv8i1(<vscale x 8 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 0)
+; CHECK-SVE-UF4-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv8i1(<vscale x 8 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 2)
+; CHECK-SVE-UF4-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv8i1(<vscale x 8 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 4)
+; CHECK-SVE-UF4-NEXT: [[TMP5:%.*]] = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv8i1(<vscale x 8 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 6)
+; CHECK-SVE-UF4-NEXT: [[TMP6:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF4-NEXT: [[DOTIDX:%.*]] = shl nuw nsw i64 [[TMP6]], 4
+; CHECK-SVE-UF4-NEXT: [[TMP7:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF4-NEXT: [[DOTIDX15:%.*]] = shl nuw nsw i64 [[TMP7]], 5
+; CHECK-SVE-UF4-NEXT: [[TMP8:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF4-NEXT: [[DOTIDX16:%.*]] = mul nuw nsw i64 [[TMP8]], 48
+; CHECK-SVE-UF4-NEXT: [[TMP9:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF4-NEXT: [[DOTIDX17:%.*]] = shl nuw nsw i64 [[TMP9]], 4
+; CHECK-SVE-UF4-NEXT: [[TMP10:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF4-NEXT: [[DOTIDX18:%.*]] = shl nuw nsw i64 [[TMP10]], 5
+; CHECK-SVE-UF4-NEXT: [[TMP11:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-SVE-UF4-NEXT: [[DOTIDX19:%.*]] = mul nuw nsw i64 [[TMP11]], 48
+; CHECK-SVE-UF4-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-SVE-UF4: vector.body:
+; CHECK-SVE-UF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[TMP2]], [[FOR_BODY_PREHEADER]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK6:%.*]] = phi <vscale x 2 x i1> [ [[TMP3]], [[FOR_BODY_PREHEADER]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK7:%.*]] = phi <vscale x 2 x i1> [ [[TMP4]], [[FOR_BODY_PREHEADER]] ], [ [[TMP26:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi <vscale x 2 x i1> [ [[TMP5]], [[FOR_BODY_PREHEADER]] ], [ [[TMP27:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-UF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw double, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-SVE-UF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP12]], i64 [[DOTIDX]]
+; CHECK-SVE-UF4-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP12]], i64 [[DOTIDX15]]
+; CHECK-SVE-UF4-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP12]], i64 [[DOTIDX16]]
+; CHECK-SVE-UF4-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP12]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> poison)
+; CHECK-SVE-UF4-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = tail call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr nonnull [[TMP13]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK6]], <vscale x 2 x double> poison)
+; CHECK-SVE-UF4-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = tail call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr nonnull [[TMP14]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 2 x double> poison)
+; CHECK-SVE-UF4-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = tail call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr nonnull [[TMP15]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 2 x double> poison)
+; CHECK-SVE-UF4-NEXT: [[TMP16:%.*]] = fmul <vscale x 2 x double> [[WIDE_MASKED_LOAD]], splat (double 3.000000e+00)
+; CHECK-SVE-UF4-NEXT: [[TMP17:%.*]] = fmul <vscale x 2 x double> [[WIDE_MASKED_LOAD9]], splat (double 3.000000e+00)
+; CHECK-SVE-UF4-NEXT: [[TMP18:%.*]] = fmul <vscale x 2 x double> [[WIDE_MASKED_LOAD10]], splat (double 3.000000e+00)
+; CHECK-SVE-UF4-NEXT: [[TMP19:%.*]] = fmul <vscale x 2 x double> [[WIDE_MASKED_LOAD11]], splat (double 3.000000e+00)
+; CHECK-SVE-UF4-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw double, ptr [[DST]], i64 [[INDEX]]
+; CHECK-SVE-UF4-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP20]], i64 [[DOTIDX17]]
+; CHECK-SVE-UF4-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP20]], i64 [[DOTIDX18]]
+; CHECK-SVE-UF4-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP20]], i64 [[DOTIDX19]]
+; CHECK-SVE-UF4-NEXT: tail call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP16]], ptr [[TMP20]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-SVE-UF4-NEXT: tail call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP17]], ptr [[TMP21]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK6]])
+; CHECK-SVE-UF4-NEXT: tail call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP18]], ptr [[TMP22]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK7]])
+; CHECK-SVE-UF4-NEXT: tail call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP19]], ptr [[TMP23]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK8]])
+; CHECK-SVE-UF4-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
+; CHECK-SVE-UF4-NEXT: [[ACTIVE_LANE_MASK_NEXT:%.*]] = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-SVE-UF4-NEXT: [[TMP24]] = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv8i1(<vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0)
+; CHECK-SVE-UF4-NEXT: [[TMP25]] = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv8i1(<vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 2)
+; CHECK-SVE-UF4-NEXT: [[TMP26]] = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv8i1(<vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 4)
+; CHECK-SVE-UF4-NEXT: [[TMP27]] = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv8i1(<vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 6)
+; CHECK-SVE-UF4-NEXT: [[TMP28:%.*]] = extractelement <vscale x 2 x i1> [[TMP24]], i64 0
+; CHECK-SVE-UF4-NEXT: br i1 [[TMP28]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-SVE-UF4: for.end:
+; CHECK-SVE-UF4-NEXT: ret void
+;
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body.preheader, label %for.end
+
+for.body.preheader:
+  %wide.trip.count = zext nneg i32 %n to i64
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ]
+  %arrayidx1 = getelementptr inbounds double, ptr %src, i64 %iv
+  %ld = load double, ptr %arrayidx1
+  %mul = fmul double %ld, 3.000000e+00
+  %arrayidx2 = getelementptr inbounds double, ptr %dst, i64 %iv
+  store double %mul, ptr %arrayidx2
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+attributes #0 = { nounwind vscale_range(1,16) "target-features"="+sve2p1" }
+
+;.
+; CHECK-SVE-UF0: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK-SVE-UF0: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK-SVE-UF0: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK-SVE-UF0: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
+;.
+; CHECK-SVE-UF2: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK-SVE-UF2: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK-SVE-UF2: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK-SVE-UF2: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
+;.
+; CHECK-SVE-UF4: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK-SVE-UF4: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK-SVE-UF4: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK-SVE-UF4: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/active-lane-mask.ll b/llvm/test/Transforms/LoopVectorize/ARM/active-lane-mask.ll
new file mode 100644
index 0000000000000..d20216654c3b4
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/ARM/active-lane-mask.ll
@@ -0,0 +1,88 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=loop-vectorize -tail-predication=enabled -prefer-predicate-over-epilogue=predicate-dont-vectorize -enable-wide-lane-mask -S | FileCheck %s
+
+target triple = "thumbv8.1m.main-arm-unknown-eabihf"
+
+define void @f0(ptr noalias %dst, ptr readonly %src, i64 %n) #0 {
+; CHECK-LABEL: define void @f0(
+; CHECK-SAME: ptr noalias [[DST:%.*]], ptr readonly [[SRC:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[VAL:%.*]] = icmp sgt i64 [[N]], 0
+; CHECK-NEXT: br i1 [[VAL]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_END:.*]]
+; CHECK: [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 31
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 32
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 16
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 [[INDEX]], i64 [[N]])
+; CHECK-NEXT: [[ACTIVE_LANE_MASK1:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 [[TMP0]], i64 [[N]])
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 16
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP2]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP3]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK1]], <16 x i8> poison)
+; CHECK-NEXT: [[TMP4:%.*]] = mul <16 x i8> [[WIDE_MASKED_LOAD]], splat (i8 3)
+; CHECK-NEXT: [[TMP5:%.*]] = mul <16 x i8> [[WIDE_MASKED_LOAD2]], splat (i8 3)
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 16
+; CHECK-NEXT: call void @llvm.masked.store.v16i8.p0(<16 x i8> [[TMP4]], ptr [[TMP7]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: call void @llvm.masked.store.v16i8.p0(<16 x i8> [[TMP5]], ptr [[TMP8]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK1]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br label %[[FOR_END_LOOPEXIT:.*]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[MUL:%.*]] = mul i8 [[TMP10]], 3
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: store i8 [[MUL]], ptr [[ARRAYIDX3]], align 1
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: [[FOR_END_LOOPEXIT]]:
+; CHECK-NEXT: br label %[[FOR_END]]
+; CHECK: [[FOR_END]]:
+; CHECK-NEXT: ret void
+;
+entry:
+  %val = icmp sgt i64 %n, 0
+  br i1 %val, label %for.body, label %for.end
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i8, ptr %src, i64 %indvars.iv
+  %0 = load i8, ptr %arrayidx, align 1
+  %mul = mul i8 %0, 3
+  %arrayidx3 = getelementptr inbounds i8, ptr %dst, i64 %indvars.iv
+  store i8 %mul, ptr %arrayidx3, align 1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !1
+
+for.end:
+  ret void
+}
+
+attributes #0 = { nofree norecurse nounwind "target-features"="+armv8.1-m.main,+mve.fp" }
+
+!1 = distinct !{!1, !2, !3}
+!2 = !{!"llvm.loop.vectorize.width", i32 16}
+!3 = !{!"llvm.loop.interleave.count", i32 2}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.
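
For reference, the pattern the wide-lane-mask checks above verify reduces to the following self-contained IR sketch: one wide @llvm.get.active.lane.mask call covers all unrolled parts, and each part's predicate is split out with @llvm.vector.extract. This sketch is not part of the patch; the function name @wide_mask_sketch, its parameters, and the trailing and/ret exist only to make the example verifiable on its own, and the intrinsic calls mirror those that appear in the SVE UF2 checks above (VF = vscale x 2, UF = 2).

; A minimal sketch of the split-wide-mask pattern; illustrative only.
declare <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64, i64)
declare <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1>, i64)

define <vscale x 2 x i1> @wide_mask_sketch(i64 %index, i64 %trip.count) {
entry:
  ; One wide mask spanning both unrolled parts (2 x VF lanes in total).
  %wide.mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %index, i64 %trip.count)
  ; Part 0 takes lanes [0, 2*vscale); part 1 takes lanes [2*vscale, 4*vscale).
  %mask.part0 = call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> %wide.mask, i64 0)
  %mask.part1 = call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> %wide.mask, i64 2)
  ; In the vectorized loop these per-part masks predicate the masked loads and
  ; stores; here they are simply combined and returned to keep the sketch
  ; self-contained.
  %combined = and <vscale x 2 x i1> %mask.part0, %mask.part1
  ret <vscale x 2 x i1> %combined
}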