
Commit 4a2fa08

[VPlan] Support VPWidenIntOrFpInductionRecipes with EVL tail folding (#144666)
Following on from #118638, this handles widened induction variables with EVL tail folding by setting the VF operand to be EVL, calculated in the vector body.

We need to do this for correctness, since with EVL tail folding the number of elements processed in the penultimate iteration may not be VF, but the runtime EVL, and we need to take this into account when updating the backedge value:

- Because the VF may now not be a live-in, we need to move the insertion point to just after the VF's definition.
- We also need to avoid truncating it when it's the same size as the step type; previously this wasn't a problem for live-ins.
- Also, because the VF may be smaller than the IV type (since the EVL is always i32), we may need to zext it.

On -march=rva23u64 -O3 we get 87.1% more loops vectorized on TSVC, and 42.8% more loops vectorized on SPEC CPU 2017.
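The correctness argument is worth seeing concretely. Below is a minimal standalone C++ sketch, not LLVM code and with purely illustrative names, of why the widened induction's backedge update must step by the runtime EVL rather than the fixed VF once tail folding is in play:

#include <algorithm>
#include <cstdio>

int main() {
  const long N = 10, VF = 4; // 10 elements, 4 lanes per vector iteration
  long iv = 0;               // first lane of the widened induction
  for (long processed = 0; processed < N;) {
    // With EVL tail folding, an iteration may process fewer than VF
    // elements, and not only the final one.
    long evl = std::min(VF, N - processed);
    // ... vector work on lanes [iv, iv + evl) ...
    iv += evl; // correct backedge update: step by the runtime EVL
    // iv += VF; // wrong: if evl < VF, later lanes get skewed values
    processed += evl;
  }
  std::printf("final iv = %ld\n", iv); // prints 10, matching N
}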
1 parent a6339d0 commit 4a2fa08

9 files changed: +616 −217 lines

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 9 additions & 7 deletions
@@ -2199,7 +2199,8 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
   VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
 
   assert(all_of(Plan.getVF().users(),
-                IsaPred<VPVectorEndPointerRecipe, VPScalarIVStepsRecipe>) &&
+                IsaPred<VPVectorEndPointerRecipe, VPScalarIVStepsRecipe,
+                        VPWidenIntOrFpInductionRecipe>) &&
          "User of VF that we can't transform to EVL.");
   Plan.getVF().replaceAllUsesWith(&EVL);

@@ -2300,12 +2301,11 @@ bool VPlanTransforms::tryAddExplicitVectorLength(
     VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
   VPBasicBlock *Header = Plan.getVectorLoopRegion()->getEntryBasicBlock();
   // The transform updates all users of inductions to work based on EVL, instead
-  // of the VF directly. At the moment, widened inductions cannot be updated, so
-  // bail out if the plan contains any.
-  bool ContainsWidenInductions = any_of(
-      Header->phis(),
-      IsaPred<VPWidenIntOrFpInductionRecipe, VPWidenPointerInductionRecipe>);
-  if (ContainsWidenInductions)
+  // of the VF directly. At the moment, widened pointer inductions cannot be
+  // updated, so bail out if the plan contains any.
+  bool ContainsWidenPointerInductions =
+      any_of(Header->phis(), IsaPred<VPWidenPointerInductionRecipe>);
+  if (ContainsWidenPointerInductions)
     return false;
 
   auto *CanonicalIVPHI = Plan.getCanonicalIV();

@@ -2627,6 +2627,8 @@ expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR,
     Inc = SplatVF;
     Prev = WidenIVR->getLastUnrolledPartOperand();
   } else {
+    if (VPRecipeBase *R = VF->getDefiningRecipe())
+      Builder.setInsertPoint(R->getParent(), std::next(R->getIterator()));
     // Multiply the vectorization factor by the step using integer or
     // floating-point arithmetic as appropriate.
     if (StepTy->isFloatingPointTy())
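The trunc/zext handling described in the commit message is not visible in the hunk above. A hedged sketch of that cast logic, written against plain llvm::IRBuilder rather than the VPlan builder, with an illustrative helper name that is not the actual API:

#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// EVL is always i32, so before it can scale an integer induction step it may
// need widening, narrowing, or no cast at all depending on the step type.
static Value *castEVLToStepTy(IRBuilder<> &B, Value *EVL, Type *StepTy) {
  unsigned StepBits = StepTy->getIntegerBitWidth();
  if (StepBits == 32)
    return EVL; // same width as EVL: no cast needed, unlike the live-in case
  if (StepBits > 32)
    return B.CreateZExt(EVL, StepTy); // IV wider than i32: zero-extend
  return B.CreateTrunc(EVL, StepTy);  // IV narrower than i32: truncate
}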

llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp

Lines changed: 20 additions & 5 deletions
@@ -156,7 +156,8 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const {
           .Case<VPWidenIntrinsicRecipe>([&](const VPWidenIntrinsicRecipe *S) {
             return VerifyEVLUse(*S, S->getNumOperands() - 1);
           })
-          .Case<VPWidenStoreEVLRecipe, VPReductionEVLRecipe>(
+          .Case<VPWidenStoreEVLRecipe, VPReductionEVLRecipe,
+                VPWidenIntOrFpInductionRecipe>(
               [&](const VPRecipeBase *S) { return VerifyEVLUse(*S, 2); })
           .Case<VPScalarIVStepsRecipe>([&](auto *R) {
             if (R->getNumOperands() != 3) {

@@ -172,13 +173,27 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const {
           .Case<VPInstruction>([&](const VPInstruction *I) {
             if (I->getOpcode() == Instruction::PHI)
               return VerifyEVLUse(*I, 1);
-            if (I->getOpcode() != Instruction::Add) {
-              errs() << "EVL is used as an operand in non-VPInstruction::Add\n";
+            switch (I->getOpcode()) {
+            case Instruction::Add:
+              break;
+            case Instruction::UIToFP:
+            case Instruction::Trunc:
+            case Instruction::ZExt:
+            case Instruction::Mul:
+            case Instruction::FMul:
+              // Opcodes above can only use EVL after wide inductions have been
+              // expanded.
+              if (!VerifyLate) {
+                errs() << "EVL used by unexpected VPInstruction\n";
+                return false;
+              }
+              break;
+            default:
+              errs() << "EVL used by unexpected VPInstruction\n";
               return false;
             }
             if (I->getNumUsers() != 1) {
-              errs() << "EVL is used in VPInstruction:Add with multiple "
-                        "users\n";
+              errs() << "EVL is used in VPInstruction with multiple users\n";
               return false;
             }
             if (!VerifyLate && !isa<VPEVLBasedIVPHIRecipe>(*I->users().begin())) {
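The UIToFP and FMul cases in the newly accepted opcode list correspond to floating-point inductions: after expansion, the i32 EVL has to be converted to the step's FP type before it can scale the step. A hedged sketch of that shape, again with plain IRBuilder and illustrative names rather than the actual VPlan code:

#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// For an FP induction, the per-iteration increment is step * EVL, computed in
// floating point. This is why UIToFP and FMul may legitimately use EVL once
// wide inductions have been expanded (hence the !VerifyLate gate above).
static Value *fpEVLIncrement(IRBuilder<> &B, Value *EVL, Value *Step,
                             Type *StepTy) {
  Value *EVLFP = B.CreateUIToFP(EVL, StepTy); // i32 EVL -> float/double
  return B.CreateFMul(Step, EVLFP);           // increment = step * EVL
}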

llvm/test/Transforms/LoopVectorize/RISCV/evl-compatible-loops.ll

Lines changed: 50 additions & 6 deletions
@@ -8,14 +8,51 @@ define void @test_wide_integer_induction(ptr noalias %a, i64 %N) {
 ; CHECK-LABEL: define void @test_wide_integer_induction(
 ; CHECK-SAME: ptr noalias [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
+; CHECK-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 2
+; CHECK-NEXT: [[TMP9:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
+; CHECK-NEXT: [[TMP10:%.*]] = mul <vscale x 2 x i64> [[TMP9]], splat (i64 1)
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP10]]
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDEX_EVL_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
+; CHECK-NEXT: [[TMP12:%.*]] = zext i32 [[TMP11]] to i64
+; CHECK-NEXT: [[TMP13:%.*]] = mul i64 1, [[TMP12]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP13]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[EVL_BASED_IV]]
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[TMP14]], i32 0
+; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VEC_IND]], ptr align 8 [[TMP15]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
+; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP11]] to i64
+; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]]
+; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], [[TMP8]]
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY1:%.*]] ]
+; CHECK-NEXT: br label [[FOR_BODY1:%.*]]
 ; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT: store i64 [[IV]], ptr [[ARRAYIDX]], align 8
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV1]]
+; CHECK-NEXT: store i64 [[IV1]], ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT: [[IV_NEXT1]] = add nuw nsw i64 [[IV1]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY1]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: for.cond.cleanup:
 ; CHECK-NEXT: ret void
 ;

@@ -68,3 +105,10 @@ for.body:
 for.cond.cleanup:
   ret void
 }
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"}
+; CHECK: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META3]], [[META1]]}
+;.
