Commit 3c3f9e4

!fixup address comments, thanks!
1 parent 736357e commit 3c3f9e4

4 files changed: +33 −24 lines changed


llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 14 additions & 10 deletions
@@ -2735,10 +2735,10 @@ class VPBundleRecipe : public VPSingleDefRecipe {
         BundleType(BundleType) {
     // Bundle up the operand recipes.
     SmallPtrSet<VPUser *, 4> BundledUsers;
-    for (auto *R : ToBundle)
+    for (auto *R : BundledOps)
       BundledUsers.insert(R);
 
-    // Recipes in the bundle, expect the last one, must only be used inside the
+    // Recipes in the bundle, except the last one, must only be used inside the
     // bundle. If there other external users, clone the recipes for the bundle.
     for (const auto &[Idx, R] : enumerate(drop_end(ToBundle))) {
       if (all_of(R->users(), [&BundledUsers](VPUser *U) {
@@ -2748,22 +2748,22 @@ class VPBundleRecipe : public VPSingleDefRecipe {
         R->removeFromParent();
         continue;
       }
-      // There users external to the bundle. Clone the recipe for use in the
+      // The users external to the bundle. Clone the recipe for use in the
       // bundle and update all its in-bundle users.
-      this->BundledOps[Idx] = R->clone();
-      BundledUsers.insert(this->BundledOps[Idx]);
-      R->replaceUsesWithIf(this->BundledOps[Idx],
-                           [&BundledUsers](VPUser &U, unsigned) {
-                             return BundledUsers.contains(&U);
-                           });
+      VPSingleDefRecipe *Copy = R->clone();
+      BundledOps[Idx] = Copy;
+      BundledUsers.insert(Copy);
+      R->replaceUsesWithIf(Copy, [&BundledUsers](VPUser &U, unsigned) {
+        return BundledUsers.contains(&U);
+      });
     }
     BundledOps.back()->removeFromParent();
 
     // Internalize all external operands to the bundled operations. To do so,
     // create new temporary VPValues for all operands not defined by recipe in
     // the bundle. The original operands are added as operands of the
     // VPBundleRecipe.
-    for (auto *R : this->BundledOps) {
+    for (auto *R : BundledOps) {
       for (const auto &[Idx, Op] : enumerate(R->operands())) {
         auto *Def = Op->getDefiningRecipe();
         if (Def && BundledUsers.contains(Def))
@@ -2802,13 +2802,17 @@ class VPBundleRecipe : public VPSingleDefRecipe {
   VP_CLASSOF_IMPL(VPDef::VPBundleSC)
 
   VPBundleRecipe *clone() override {
+    assert(!BundledOps.empty() && "empty bundles should be removed");
     return new VPBundleRecipe(BundleType, BundledOps);
   }
 
   /// Return the VPSingleDefRecipe producing the final result of the bundled
   /// recipe.
   VPSingleDefRecipe *getResultOp() const { return BundledOps.back(); }
 
+  /// Insert the bundled recipes back into the VPlan, directly before the
+  /// current recipe. Leaves the bundle recipe empty and the recipe must be
+  /// removed before codegen.
   void unbundle();
 
   /// Generate the extraction of the appropriate bit from the block mask and the
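
The constructor change above keeps a recipe in place when it still has users outside the bundle, clones it for the bundle, and retargets only the in-bundle uses to the clone. Below is a minimal standalone C++ sketch of that clone-and-retarget pattern on a toy def-use graph; the Node type, internalizeBundle helper, and AllNodes walk are hypothetical illustrations, not VPlan API.

    // Standalone sketch (hypothetical types, not VPlan API): nodes hold operand
    // pointers; a node's "users" are the nodes that list it as an operand.
    #include <algorithm>
    #include <set>
    #include <vector>

    struct Node {
      std::vector<Node *> Operands;
      Node *clone() const { return new Node(*this); }
    };

    // Mirrors the constructor logic: for every bundled node except the last, if
    // some user lives outside the bundle, clone the node for the bundle and point
    // the in-bundle users at the clone; the original keeps serving external users.
    void internalizeBundle(std::vector<Node *> &Bundle,
                           const std::vector<Node *> &AllNodes) {
      std::set<Node *> InBundle(Bundle.begin(), Bundle.end());
      for (size_t Idx = 0; Idx + 1 < Bundle.size(); ++Idx) {
        Node *N = Bundle[Idx];
        bool HasExternalUser =
            std::any_of(AllNodes.begin(), AllNodes.end(), [&](Node *U) {
              return !InBundle.count(U) &&
                     std::count(U->Operands.begin(), U->Operands.end(), N);
            });
        if (!HasExternalUser)
          continue; // used only inside the bundle: keep the original as-is
        Node *Copy = N->clone();
        Bundle[Idx] = Copy;
        InBundle.insert(Copy);
        // replaceUsesWithIf-style update: only in-bundle users switch to the copy.
        for (Node *U : AllNodes)
          if (InBundle.count(U))
            std::replace(U->Operands.begin(), U->Operands.end(), N, Copy);
      }
    }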

llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp

Lines changed: 5 additions & 1 deletion
@@ -268,7 +268,11 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
   Type *ResultTy =
       TypeSwitch<const VPRecipeBase *, Type *>(V->getDefiningRecipe())
           .Case<VPBundleRecipe>([this](const auto *R) {
-            return inferScalarType(R->getOperand(R->getNumOperands() - 2));
+            unsigned RdxOpIdxOffset =
+                cast<VPReductionRecipe>(R->getResultOp())->isConditional() ? 2
+                                                                           : 1;
+            return inferScalarType(
+                R->getOperand(R->getNumOperands() - RdxOpIdxOffset));
           })
           .Case<VPActiveLaneMaskPHIRecipe, VPCanonicalIVPHIRecipe,
                 VPFirstOrderRecurrencePHIRecipe, VPReductionPHIRecipe,
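
The inferScalarType change no longer hard-codes the operand at offset 2 from the end: the offset is 2 only when the bundled reduction is conditional (and so carries a trailing condition operand), otherwise 1. A minimal sketch of that offset selection, with a hypothetical Operand type standing in for VPValue operands:

    // Hedged sketch, not the VPlan API: pick the trailing operand used for type
    // inference. A conditional reduction keeps its condition as the last operand,
    // so the operand of interest sits one slot further from the end.
    #include <cassert>
    #include <vector>

    struct Operand { int TypeId; };

    int typeInferenceOperand(const std::vector<Operand> &Ops, bool IsConditional) {
      unsigned RdxOpIdxOffset = IsConditional ? 2 : 1; // skip the condition, if any
      assert(Ops.size() >= RdxOpIdxOffset && "bundle has too few operands");
      return Ops[Ops.size() - RdxOpIdxOffset].TypeId;
    }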

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 2 additions & 1 deletion
@@ -2590,6 +2590,7 @@ void VPBundleRecipe::unbundle() {
       BundledOps.size() == 5) {
     // Note that we will drop the extend after mul which transforms
     // reduce.add(ext(mul(ext, ext))) to reduce.add(mul(ext, ext)).
+    // TODO: This transform should be done separately from bundling/unbundling.
     auto *Ext0 = cast<VPWidenCastRecipe>(BundledOps[0]);
     auto *Ext1 = cast<VPWidenCastRecipe>(BundledOps[1]);
     auto *Ext2 = cast<VPWidenCastRecipe>(BundledOps[3]);
@@ -2602,7 +2603,7 @@ void VPBundleRecipe::unbundle() {
     if (Ext0 != Ext1) {
       Op1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0),
                                   Ext2->getResultType(), *Ext1, getDebugLoc());
-      Op1->insertBefore(Ext0);
+      Op1->insertBefore(Ext1);
     }
     auto *Mul = cast<VPWidenRecipe>(BundledOps[2]);
     auto *Red = cast<VPReductionRecipe>(BundledOps[4]);
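
The one-line fix above moves the insertion point of the rebuilt extend from Ext0 to Ext1. One plausible reading: the new cast reads Ext1's operand, and inserting it at Ext1's original position guarantees that operand is already defined there, which need not hold at Ext0's position. A toy C++ sketch of that def-before-use invariant (the Inst/Block types and insertBefore helper are hypothetical, not LLVM API):

    // Toy sketch of the invariant the fix preserves: a new single-operand
    // instruction may only be inserted at a point its operand already precedes.
    #include <algorithm>
    #include <cassert>
    #include <list>

    struct Inst {
      Inst *Operand = nullptr; // the single value this instruction reads
    };

    using Block = std::list<Inst *>;

    void insertBefore(Block &B, Inst *NewI, Inst *Pos) {
      auto PosIt = std::find(B.begin(), B.end(), Pos);
      assert(PosIt != B.end() && "insertion point not in block");
      // Def must come before use: the operand has to appear before Pos.
      assert(std::find(B.begin(), PosIt, NewI->Operand) != PosIt &&
             "operand not defined before the insertion point");
      B.insert(PosIt, NewI);
    }

Inserting before Ext1 trivially satisfies this check, since Ext1 itself already read that operand at the same point; inserting before Ext0 could land ahead of the operand's definition when Ext0 comes earlier in the block.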

llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll

Lines changed: 12 additions & 12 deletions
@@ -115,20 +115,20 @@ define i64 @add_i16_i64(ptr nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
 ; CHECK-NEXT:    br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; CHECK:       for.body.preheader:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 8
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_VEC:%.*]] = and i32 [[N]], 2147483640
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i32 [[N]], 2147483644
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[INDEX]]
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP0]], align 2
-; CHECK-NEXT:    [[TMP1:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i64>
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP1]])
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP0]], align 2
+; CHECK-NEXT:    [[TMP1:%.*]] = sext <4 x i16> [[WIDE_LOAD]] to <4 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP1]])
 ; CHECK-NEXT:    [[TMP3]] = add i64 [[TMP2]], [[VEC_PHI]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
@@ -180,20 +180,20 @@ define i64 @add_i8_i64(ptr nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
 ; CHECK-NEXT:    br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; CHECK:       for.body.preheader:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 16
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_VEC:%.*]] = and i32 [[N]], 2147483632
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i32 [[N]], 2147483644
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[X:%.*]], i32 [[INDEX]]
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1
-; CHECK-NEXT:    [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64>
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> [[TMP1]])
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP0]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP1]])
 ; CHECK-NEXT:    [[TMP3]] = add i64 [[TMP2]], [[VEC_PHI]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       middle.block:
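
The updated CHECK lines reflect a vectorization factor of 4 instead of 8/16 for these i16/i8 to i64 add reductions, so the minimum-iteration thresholds and trip-count masks change with it. As a quick sanity check of those masks, for non-negative 32-bit N the 'and' constants are just N rounded down to a multiple of the VF (a small standalone C++ check, not part of the commit):

    // 2147483644 = 0x7FFFFFFC masks off the low two bits, i.e. rounds a
    // non-negative 32-bit N down to a multiple of the new VF of 4;
    // 2147483640/2147483632 did the same for the old VFs of 8 and 16.
    #include <cassert>
    #include <cstdint>

    int32_t roundedTripCount(int32_t N, int32_t VF) {
      return N & ~(VF - 1); // VF is a power of two, N assumed non-negative
    }

    int main() {
      assert(roundedTripCount(10, 4) == (10 & 2147483644));  // new VF-4 mask
      assert(roundedTripCount(10, 8) == (10 & 2147483640));  // old VF-8 mask
      assert(roundedTripCount(23, 16) == (23 & 2147483632)); // old VF-16 mask
      return 0;
    }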
