Commit 3c3f9e4

!fixup address comments, thanks!
1 parent 736357e commit 3c3f9e4

4 files changed: +33 −24 lines changed


llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 14 additions & 10 deletions
@@ -2735,10 +2735,10 @@ class VPBundleRecipe : public VPSingleDefRecipe {
         BundleType(BundleType) {
     // Bundle up the operand recipes.
     SmallPtrSet<VPUser *, 4> BundledUsers;
-    for (auto *R : ToBundle)
+    for (auto *R : BundledOps)
       BundledUsers.insert(R);
 
-    // Recipes in the bundle, expect the last one, must only be used inside the
+    // Recipes in the bundle, except the last one, must only be used inside the
     // bundle. If there other external users, clone the recipes for the bundle.
     for (const auto &[Idx, R] : enumerate(drop_end(ToBundle))) {
       if (all_of(R->users(), [&BundledUsers](VPUser *U) {
@@ -2748,22 +2748,22 @@ class VPBundleRecipe : public VPSingleDefRecipe {
         R->removeFromParent();
         continue;
       }
-      // There users external to the bundle. Clone the recipe for use in the
+      // The users external to the bundle. Clone the recipe for use in the
       // bundle and update all its in-bundle users.
-      this->BundledOps[Idx] = R->clone();
-      BundledUsers.insert(this->BundledOps[Idx]);
-      R->replaceUsesWithIf(this->BundledOps[Idx],
-                           [&BundledUsers](VPUser &U, unsigned) {
-                             return BundledUsers.contains(&U);
-                           });
+      VPSingleDefRecipe *Copy = R->clone();
+      BundledOps[Idx] = Copy;
+      BundledUsers.insert(Copy);
+      R->replaceUsesWithIf(Copy, [&BundledUsers](VPUser &U, unsigned) {
+        return BundledUsers.contains(&U);
+      });
     }
     BundledOps.back()->removeFromParent();
 
     // Internalize all external operands to the bundled operations. To do so,
     // create new temporary VPValues for all operands not defined by recipe in
     // the bundle. The original operands are added as operands of the
     // VPBundleRecipe.
-    for (auto *R : this->BundledOps) {
+    for (auto *R : BundledOps) {
       for (const auto &[Idx, Op] : enumerate(R->operands())) {
         auto *Def = Op->getDefiningRecipe();
         if (Def && BundledUsers.contains(Def))
@@ -2802,13 +2802,17 @@ class VPBundleRecipe : public VPSingleDefRecipe {
   VP_CLASSOF_IMPL(VPDef::VPBundleSC)
 
   VPBundleRecipe *clone() override {
+    assert(!BundledOps.empty() && "empty bundles should be removed");
     return new VPBundleRecipe(BundleType, BundledOps);
   }
 
   /// Return the VPSingleDefRecipe producing the final result of the bundled
   /// recipe.
   VPSingleDefRecipe *getResultOp() const { return BundledOps.back(); }
 
+  /// Insert the bundled recipes back into the VPlan, directly before the
+  /// current recipe. Leaves the bundle recipe empty and the recipe must be
+  /// removed before codegen.
   void unbundle();
 
   /// Generate the extraction of the appropriate bit from the block mask and the
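
The constructor change above keeps a recipe in place when it still has users outside the bundle, clones it for the bundle, and retargets only the in-bundle uses to the clone. Below is a minimal standalone C++ sketch of that clone-and-retarget pattern on a toy def-use graph; the Node type, internalizeBundle helper, and AllNodes walk are hypothetical illustrations, not VPlan API.

    // Standalone sketch (hypothetical types, not VPlan API): nodes hold operand
    // pointers; a node's "users" are the nodes that list it as an operand.
    #include <algorithm>
    #include <set>
    #include <vector>

    struct Node {
      std::vector<Node *> Operands;
      Node *clone() const { return new Node(*this); }
    };

    // Mirrors the constructor logic: for every bundled node except the last, if
    // some user lives outside the bundle, clone the node for the bundle and point
    // the in-bundle users at the clone; the original keeps serving external users.
    void internalizeBundle(std::vector<Node *> &Bundle,
                           const std::vector<Node *> &AllNodes) {
      std::set<Node *> InBundle(Bundle.begin(), Bundle.end());
      for (size_t Idx = 0; Idx + 1 < Bundle.size(); ++Idx) {
        Node *N = Bundle[Idx];
        bool HasExternalUser =
            std::any_of(AllNodes.begin(), AllNodes.end(), [&](Node *U) {
              return !InBundle.count(U) &&
                     std::count(U->Operands.begin(), U->Operands.end(), N);
            });
        if (!HasExternalUser)
          continue; // used only inside the bundle: keep the original as-is
        Node *Copy = N->clone();
        Bundle[Idx] = Copy;
        InBundle.insert(Copy);
        // replaceUsesWithIf-style update: only in-bundle users switch to the copy.
        for (Node *U : AllNodes)
          if (InBundle.count(U))
            std::replace(U->Operands.begin(), U->Operands.end(), N, Copy);
      }
    }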

llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp

Lines changed: 5 additions & 1 deletion
@@ -268,7 +268,11 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
   Type *ResultTy =
       TypeSwitch<const VPRecipeBase *, Type *>(V->getDefiningRecipe())
           .Case<VPBundleRecipe>([this](const auto *R) {
-            return inferScalarType(R->getOperand(R->getNumOperands() - 2));
+            unsigned RdxOpIdxOffset =
+                cast<VPReductionRecipe>(R->getResultOp())->isConditional() ? 2
+                                                                           : 1;
+            return inferScalarType(
+                R->getOperand(R->getNumOperands() - RdxOpIdxOffset));
           })
           .Case<VPActiveLaneMaskPHIRecipe, VPCanonicalIVPHIRecipe,
                 VPFirstOrderRecurrencePHIRecipe, VPReductionPHIRecipe,
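
The inferScalarType change no longer hard-codes the operand at offset 2 from the end: the offset is 2 only when the bundled reduction is conditional (and so carries a trailing condition operand), otherwise 1. A minimal sketch of that offset selection, with a hypothetical Operand type standing in for VPValue operands:

    // Hedged sketch, not the VPlan API: pick the trailing operand used for type
    // inference. A conditional reduction keeps its condition as the last operand,
    // so the operand of interest sits one slot further from the end.
    #include <cassert>
    #include <vector>

    struct Operand { int TypeId; };

    int typeInferenceOperand(const std::vector<Operand> &Ops, bool IsConditional) {
      unsigned RdxOpIdxOffset = IsConditional ? 2 : 1; // skip the condition, if any
      assert(Ops.size() >= RdxOpIdxOffset && "bundle has too few operands");
      return Ops[Ops.size() - RdxOpIdxOffset].TypeId;
    }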

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 2 additions & 1 deletion
@@ -2590,6 +2590,7 @@ void VPBundleRecipe::unbundle() {
       BundledOps.size() == 5) {
     // Note that we will drop the extend after mul which transforms
     // reduce.add(ext(mul(ext, ext))) to reduce.add(mul(ext, ext)).
+    // TODO: This transform should be done separately from bundling/unbundling.
     auto *Ext0 = cast<VPWidenCastRecipe>(BundledOps[0]);
     auto *Ext1 = cast<VPWidenCastRecipe>(BundledOps[1]);
     auto *Ext2 = cast<VPWidenCastRecipe>(BundledOps[3]);
@@ -2602,7 +2603,7 @@ void VPBundleRecipe::unbundle() {
     if (Ext0 != Ext1) {
       Op1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0),
                                   Ext2->getResultType(), *Ext1, getDebugLoc());
-      Op1->insertBefore(Ext0);
+      Op1->insertBefore(Ext1);
     }
     auto *Mul = cast<VPWidenRecipe>(BundledOps[2]);
     auto *Red = cast<VPReductionRecipe>(BundledOps[4]);
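
The one-line fix above moves the insertion point of the rebuilt extend from Ext0 to Ext1. One plausible reading: the new cast reads Ext1's operand, and inserting it at Ext1's original position guarantees that operand is already defined there, which need not hold at Ext0's position. A toy C++ sketch of that def-before-use invariant (the Inst/Block types and insertBefore helper are hypothetical, not LLVM API):

    // Toy sketch of the invariant the fix preserves: a new single-operand
    // instruction may only be inserted at a point its operand already precedes.
    #include <algorithm>
    #include <cassert>
    #include <list>

    struct Inst {
      Inst *Operand = nullptr; // the single value this instruction reads
    };

    using Block = std::list<Inst *>;

    void insertBefore(Block &B, Inst *NewI, Inst *Pos) {
      auto PosIt = std::find(B.begin(), B.end(), Pos);
      assert(PosIt != B.end() && "insertion point not in block");
      // Def must come before use: the operand has to appear before Pos.
      assert(std::find(B.begin(), PosIt, NewI->Operand) != PosIt &&
             "operand not defined before the insertion point");
      B.insert(PosIt, NewI);
    }

Inserting before Ext1 trivially satisfies this check, since Ext1 itself already read that operand at the same point; inserting before Ext0 could land ahead of the operand's definition when Ext0 comes earlier in the block.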

llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll

Lines changed: 12 additions & 12 deletions
@@ -115,20 +115,20 @@ define i64 @add_i16_i64(ptr nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
 ; CHECK-NEXT:    br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; CHECK:       for.body.preheader:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 8
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_VEC:%.*]] = and i32 [[N]], 2147483640
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i32 [[N]], 2147483644
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[INDEX]]
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP0]], align 2
-; CHECK-NEXT:    [[TMP1:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i64>
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP1]])
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP0]], align 2
+; CHECK-NEXT:    [[TMP1:%.*]] = sext <4 x i16> [[WIDE_LOAD]] to <4 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP1]])
 ; CHECK-NEXT:    [[TMP3]] = add i64 [[TMP2]], [[VEC_PHI]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
@@ -180,20 +180,20 @@ define i64 @add_i8_i64(ptr nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
 ; CHECK-NEXT:    br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; CHECK:       for.body.preheader:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 16
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_VEC:%.*]] = and i32 [[N]], 2147483632
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i32 [[N]], 2147483644
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[X:%.*]], i32 [[INDEX]]
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1
-; CHECK-NEXT:    [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64>
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> [[TMP1]])
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP0]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP1]])
 ; CHECK-NEXT:    [[TMP3]] = add i64 [[TMP2]], [[VEC_PHI]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       middle.block:
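
The updated CHECK lines reflect a vectorization factor of 4 instead of 8/16 for these i16/i8 to i64 add reductions, so the minimum-iteration thresholds and trip-count masks change with it. As a quick sanity check of those masks, for non-negative 32-bit N the 'and' constants are just N rounded down to a multiple of the VF (a small standalone C++ check, not part of the commit):

    // 2147483644 = 0x7FFFFFFC masks off the low two bits, i.e. rounds a
    // non-negative 32-bit N down to a multiple of the new VF of 4;
    // 2147483640/2147483632 did the same for the old VFs of 8 and 16.
    #include <cassert>
    #include <cstdint>

    int32_t roundedTripCount(int32_t N, int32_t VF) {
      return N & ~(VF - 1); // VF is a power of two, N assumed non-negative
    }

    int main() {
      assert(roundedTripCount(10, 4) == (10 & 2147483644));  // new VF-4 mask
      assert(roundedTripCount(10, 8) == (10 & 2147483640));  // old VF-8 mask
      assert(roundedTripCount(23, 16) == (23 & 2147483632)); // old VF-16 mask
      return 0;
    }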
