Skip to content

Commit a75e062

Browse files
authored
[LV] Use vscale for tuning when updating profile information (#143690)
In fixVectorizedLoop we call setProfileInfoAfterUnrolling to update the profile information after vectorising; however, for scalable VFs we pessimistically assume vscale=1. We can improve upon this by using the value of vscale used for tuning, e.g. when targeting neoverse-v1 the expected value is 2.
1 parent 3dd61c1 commit a75e062

File tree

3 files changed

+169
-17
lines changed

3 files changed

+169
-17
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 18 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -2691,6 +2691,20 @@ static void cse(BasicBlock *BB) {
26912691
}
26922692
}
26932693

2694+
/// This function attempts to return a value that represents the vectorization
2695+
/// factor at runtime. For fixed-width VFs we know this precisely at compile
2696+
/// time, but for scalable VFs we calculate it based on an estimate of the
2697+
/// vscale value.
2698+
static unsigned getEstimatedRuntimeVF(ElementCount VF,
2699+
std::optional<unsigned> VScale) {
2700+
unsigned EstimatedVF = VF.getKnownMinValue();
2701+
if (VF.isScalable())
2702+
if (VScale)
2703+
EstimatedVF *= *VScale;
2704+
assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
2705+
return EstimatedVF;
2706+
}
2707+
26942708
InstructionCost
26952709
LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
26962710
ElementCount VF) const {
@@ -2790,10 +2804,11 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
27902804
//
27912805
// For scalable vectorization we can't know at compile time how many
27922806
// iterations of the loop are handled in one vector iteration, so instead
2793-
// assume a pessimistic vscale of '1'.
2807+
// use the value of vscale used for tuning.
27942808
Loop *VectorLoop = LI->getLoopFor(HeaderBB);
2795-
setProfileInfoAfterUnrolling(OrigLoop, VectorLoop, OrigLoop,
2796-
VF.getKnownMinValue() * UF);
2809+
unsigned EstimatedVFxUF =
2810+
getEstimatedRuntimeVF(VF * UF, Cost->getVScaleForTuning());
2811+
setProfileInfoAfterUnrolling(OrigLoop, VectorLoop, OrigLoop, EstimatedVFxUF);
27972812
}
27982813

27992814
void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
@@ -4031,20 +4046,6 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
40314046
return MaxVF;
40324047
}
40334048

4034-
/// This function attempts to return a value that represents the vectorization
4035-
/// factor at runtime. For fixed-width VFs we know this precisely at compile
4036-
/// time, but for scalable VFs we calculate it based on an estimate of the
4037-
/// vscale value.
4038-
static unsigned getEstimatedRuntimeVF(ElementCount VF,
4039-
std::optional<unsigned> VScale) {
4040-
unsigned EstimatedVF = VF.getKnownMinValue();
4041-
if (VF.isScalable())
4042-
if (VScale)
4043-
EstimatedVF *= *VScale;
4044-
assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
4045-
return EstimatedVF;
4046-
}
4047-
40484049
bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
40494050
const VectorizationFactor &B,
40504051
const unsigned MaxTripCount,
Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "br" --filter "^.*:" --version 5
2+
; RUN: opt -passes="print<block-freq>,loop-vectorize" -mcpu=neoverse-v1 -force-vector-interleave=1 -S < %s | FileCheck %s -check-prefix=CHECK-V1-IC1
3+
; RUN: opt -passes="print<block-freq>,loop-vectorize" -mcpu=neoverse-v2 -force-vector-interleave=1 -S < %s | FileCheck %s -check-prefix=CHECK-V2-IC1
4+
; RUN: opt -passes="print<block-freq>,loop-vectorize" -mcpu=neoverse-v2 -force-vector-interleave=4 -S < %s | FileCheck %s -check-prefix=CHECK-V2-IC4
5+
6+
target triple = "aarch64-unknown-linux-gnu"
7+
8+
@a = global [1024 x i32] zeroinitializer, align 16
9+
@b = global [1024 x i32] zeroinitializer, align 16
10+
11+
; We expect the branch weight computations after vectorisation to use
12+
; vscale=2 for neoverse-v1 and vscale=1 for neoverse-v2.
13+
define void @_Z3foov() {
14+
; CHECK-V1-IC1-LABEL: define void @_Z3foov(
15+
; CHECK-V1-IC1-SAME: ) #[[ATTR0:[0-9]+]] {
16+
; CHECK-V1-IC1: [[ENTRY:.*:]]
17+
; CHECK-V1-IC1: br i1 [[MIN_ITERS_CHECK:%.*]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]], !prof [[PROF0:![0-9]+]]
18+
; CHECK-V1-IC1: [[VECTOR_PH]]:
19+
; CHECK-V1-IC1: br label %[[VECTOR_BODY:.*]]
20+
; CHECK-V1-IC1: [[VECTOR_BODY]]:
21+
; CHECK-V1-IC1: br i1 [[TMP10:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF0]], !llvm.loop [[LOOP1:![0-9]+]]
22+
; CHECK-V1-IC1: [[MIDDLE_BLOCK]]:
23+
; CHECK-V1-IC1: br i1 [[CMP_N:%.*]], label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]], !prof [[PROF4:![0-9]+]]
24+
; CHECK-V1-IC1: [[SCALAR_PH]]:
25+
; CHECK-V1-IC1: br label %[[FOR_BODY:.*]]
26+
; CHECK-V1-IC1: [[FOR_BODY]]:
27+
; CHECK-V1-IC1: br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !prof [[PROF5:![0-9]+]], !llvm.loop [[LOOP6:![0-9]+]]
28+
; CHECK-V1-IC1: [[FOR_COND_CLEANUP]]:
29+
;
30+
; CHECK-V2-IC1-LABEL: define void @_Z3foov(
31+
; CHECK-V2-IC1-SAME: ) #[[ATTR0:[0-9]+]] {
32+
; CHECK-V2-IC1: [[ENTRY:.*:]]
33+
; CHECK-V2-IC1: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]], !prof [[PROF0:![0-9]+]]
34+
; CHECK-V2-IC1: [[VECTOR_PH]]:
35+
; CHECK-V2-IC1: br label %[[VECTOR_BODY:.*]]
36+
; CHECK-V2-IC1: [[VECTOR_BODY]]:
37+
; CHECK-V2-IC1: br i1 [[TMP4:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]]
38+
; CHECK-V2-IC1: [[MIDDLE_BLOCK]]:
39+
; CHECK-V2-IC1: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]], !prof [[PROF5:![0-9]+]]
40+
; CHECK-V2-IC1: [[SCALAR_PH]]:
41+
; CHECK-V2-IC1: br label %[[FOR_BODY:.*]]
42+
; CHECK-V2-IC1: [[FOR_BODY]]:
43+
; CHECK-V2-IC1: br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !prof [[PROF6:![0-9]+]], !llvm.loop [[LOOP7:![0-9]+]]
44+
; CHECK-V2-IC1: [[FOR_COND_CLEANUP]]:
45+
;
46+
; CHECK-V2-IC4-LABEL: define void @_Z3foov(
47+
; CHECK-V2-IC4-SAME: ) #[[ATTR0:[0-9]+]] {
48+
; CHECK-V2-IC4: [[VEC_EPILOG_VECTOR_BODY1:.*:]]
49+
; CHECK-V2-IC4: br i1 [[MIN_ITERS_CHECK:%.*]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]], !prof [[PROF0:![0-9]+]]
50+
; CHECK-V2-IC4: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
51+
; CHECK-V2-IC4: br i1 false, label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]], !prof [[PROF0]]
52+
; CHECK-V2-IC4: [[VECTOR_PH]]:
53+
; CHECK-V2-IC4: br label %[[VECTOR_BODY:.*]]
54+
; CHECK-V2-IC4: [[VECTOR_BODY]]:
55+
; CHECK-V2-IC4: br i1 [[TMP12:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]]
56+
; CHECK-V2-IC4: [[MIDDLE_BLOCK]]:
57+
; CHECK-V2-IC4: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]], !prof [[PROF5:![0-9]+]]
58+
; CHECK-V2-IC4: [[VEC_EPILOG_ITER_CHECK]]:
59+
; CHECK-V2-IC4: br i1 [[MIN_EPILOG_ITERS_CHECK:%.*]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF6:![0-9]+]]
60+
; CHECK-V2-IC4: [[VEC_EPILOG_PH]]:
61+
; CHECK-V2-IC4: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
62+
; CHECK-V2-IC4: [[VEC_EPILOG_VECTOR_BODY]]:
63+
; CHECK-V2-IC4: br i1 [[TMP23:%.*]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
64+
; CHECK-V2-IC4: [[VEC_EPILOG_MIDDLE_BLOCK]]:
65+
; CHECK-V2-IC4: br i1 [[CMP_N:%.*]], label %[[FOR_COND_CLEANUP]], label %[[VEC_EPILOG_SCALAR_PH]], !prof [[PROF8:![0-9]+]]
66+
; CHECK-V2-IC4: [[VEC_EPILOG_SCALAR_PH]]:
67+
; CHECK-V2-IC4: br label %[[FOR_BODY:.*]]
68+
; CHECK-V2-IC4: [[FOR_BODY]]:
69+
; CHECK-V2-IC4: br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !prof [[PROF9:![0-9]+]], !llvm.loop [[LOOP10:![0-9]+]]
70+
; CHECK-V2-IC4: [[FOR_COND_CLEANUP]]:
71+
;
72+
entry:
73+
br label %for.body
74+
75+
for.body: ; preds = %for.body, %entry
76+
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
77+
%arrayidx = getelementptr inbounds [1024 x i32], ptr @b, i64 0, i64 %iv
78+
%load = load i32, ptr %arrayidx, align 4
79+
%arrayidx2 = getelementptr inbounds [1024 x i32], ptr @a, i64 0, i64 %iv
80+
store i32 %load, ptr %arrayidx2, align 4
81+
%iv.next = add nuw nsw i64 %iv, 1
82+
%exitcond = icmp eq i64 %iv.next, 1024
83+
br i1 %exitcond, label %for.cond.cleanup, label %for.body, !prof !0
84+
85+
for.cond.cleanup: ; preds = %for.body
86+
ret void
87+
}
88+
89+
!0 = !{!"branch_weights", i32 1, i32 1023}
90+
;.
91+
; CHECK-V1-IC1: [[PROF0]] = !{!"branch_weights", i32 1, i32 127}
92+
; CHECK-V1-IC1: [[LOOP1]] = distinct !{[[LOOP1]], [[META2:![0-9]+]], [[META3:![0-9]+]]}
93+
; CHECK-V1-IC1: [[META2]] = !{!"llvm.loop.isvectorized", i32 1}
94+
; CHECK-V1-IC1: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"}
95+
; CHECK-V1-IC1: [[PROF4]] = !{!"branch_weights", i32 1, i32 3}
96+
; CHECK-V1-IC1: [[PROF5]] = !{!"branch_weights", i32 0, i32 0}
97+
; CHECK-V1-IC1: [[LOOP6]] = distinct !{[[LOOP6]], [[META3]], [[META2]]}
98+
;.
99+
; CHECK-V2-IC1: [[PROF0]] = !{!"branch_weights", i32 1, i32 127}
100+
; CHECK-V2-IC1: [[PROF1]] = !{!"branch_weights", i32 1, i32 255}
101+
; CHECK-V2-IC1: [[LOOP2]] = distinct !{[[LOOP2]], [[META3:![0-9]+]], [[META4:![0-9]+]]}
102+
; CHECK-V2-IC1: [[META3]] = !{!"llvm.loop.isvectorized", i32 1}
103+
; CHECK-V2-IC1: [[META4]] = !{!"llvm.loop.unroll.runtime.disable"}
104+
; CHECK-V2-IC1: [[PROF5]] = !{!"branch_weights", i32 1, i32 3}
105+
; CHECK-V2-IC1: [[PROF6]] = !{!"branch_weights", i32 0, i32 0}
106+
; CHECK-V2-IC1: [[LOOP7]] = distinct !{[[LOOP7]], [[META4]], [[META3]]}
107+
;.
108+
; CHECK-V2-IC4: [[PROF0]] = !{!"branch_weights", i32 1, i32 127}
109+
; CHECK-V2-IC4: [[PROF1]] = !{!"branch_weights", i32 1, i32 63}
110+
; CHECK-V2-IC4: [[LOOP2]] = distinct !{[[LOOP2]], [[META3:![0-9]+]], [[META4:![0-9]+]]}
111+
; CHECK-V2-IC4: [[META3]] = !{!"llvm.loop.isvectorized", i32 1}
112+
; CHECK-V2-IC4: [[META4]] = !{!"llvm.loop.unroll.runtime.disable"}
113+
; CHECK-V2-IC4: [[PROF5]] = !{!"branch_weights", i32 1, i32 15}
114+
; CHECK-V2-IC4: [[PROF6]] = !{!"branch_weights", i32 2, i32 0}
115+
; CHECK-V2-IC4: [[LOOP7]] = distinct !{[[LOOP7]], [[META3]], [[META4]]}
116+
; CHECK-V2-IC4: [[PROF8]] = !{!"branch_weights", i32 1, i32 1}
117+
; CHECK-V2-IC4: [[PROF9]] = !{!"branch_weights", i32 0, i32 0}
118+
; CHECK-V2-IC4: [[LOOP10]] = distinct !{[[LOOP10]], [[META4]], [[META3]]}
119+
;.

llvm/test/Transforms/LoopVectorize/check-prof-info.ll

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "br" --filter "^.*:"
22
; RUN: opt -passes="print<block-freq>,loop-vectorize" -force-vector-width=4 -force-vector-interleave=1 -S < %s | FileCheck %s
33
; RUN: opt -passes="print<block-freq>,loop-vectorize" -force-vector-width=4 -force-vector-interleave=4 -S < %s | FileCheck %s -check-prefix=CHECK-MASKED
4+
; RUN: opt -passes="print<block-freq>,loop-vectorize" -force-vector-width=4 -force-vector-interleave=1 \
5+
; RUN: -scalable-vectorization=on -force-target-supports-scalable-vectors -S < %s | FileCheck %s -check-prefix=CHECK-SCALABLE
46

57
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
68

@@ -39,6 +41,21 @@ define void @_Z3foov() {
3941
; CHECK-MASKED: for.body:
4042
; CHECK-MASKED: br i1 [[EXITCOND:%.*]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !prof [[PROF6:![0-9]+]], !llvm.loop [[LOOP7:![0-9]+]]
4143
;
44+
; CHECK-SCALABLE-LABEL: @_Z3foov(
45+
; CHECK-SCALABLE: entry:
46+
; CHECK-SCALABLE: br i1 [[MIN_ITERS_CHECK:%.*]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF0:![0-9]+]]
47+
; CHECK-SCALABLE: vector.ph:
48+
; CHECK-SCALABLE: br label [[VECTOR_BODY:%.*]]
49+
; CHECK-SCALABLE: vector.body:
50+
; CHECK-SCALABLE: br i1 [[TMP16:%.*]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]]
51+
; CHECK-SCALABLE: middle.block:
52+
; CHECK-SCALABLE: br i1 [[CMP_N:%.*]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]], !prof [[PROF5:![0-9]+]]
53+
; CHECK-SCALABLE: scalar.ph:
54+
; CHECK-SCALABLE: br label [[FOR_BODY:%.*]]
55+
; CHECK-SCALABLE: for.cond.cleanup:
56+
; CHECK-SCALABLE: for.body:
57+
; CHECK-SCALABLE: br i1 [[EXITCOND:%.*]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !prof [[PROF6:![0-9]+]], !llvm.loop [[LOOP7:![0-9]+]]
58+
;
4259
entry:
4360
br label %for.body
4461

@@ -92,6 +109,21 @@ define void @_Z3foo2v() {
92109
; CHECK-MASKED: for.body:
93110
; CHECK-MASKED: br i1 [[EXITCOND:%.*]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !prof [[PROF9:![0-9]+]], !llvm.loop [[LOOP10:![0-9]+]]
94111
;
112+
; CHECK-SCALABLE-LABEL: @_Z3foo2v(
113+
; CHECK-SCALABLE: entry:
114+
; CHECK-SCALABLE: br i1 [[MIN_ITERS_CHECK:%.*]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF0]]
115+
; CHECK-SCALABLE: vector.ph:
116+
; CHECK-SCALABLE: br label [[VECTOR_BODY:%.*]]
117+
; CHECK-SCALABLE: vector.body:
118+
; CHECK-SCALABLE: br i1 [[TMP16:%.*]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF1]], !llvm.loop [[LOOP8:![0-9]+]]
119+
; CHECK-SCALABLE: middle.block:
120+
; CHECK-SCALABLE: br i1 [[CMP_N:%.*]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]], !prof [[PROF5]]
121+
; CHECK-SCALABLE: scalar.ph:
122+
; CHECK-SCALABLE: br label [[FOR_BODY:%.*]]
123+
; CHECK-SCALABLE: for.cond.cleanup:
124+
; CHECK-SCALABLE: for.body:
125+
; CHECK-SCALABLE: br i1 [[EXITCOND:%.*]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !prof [[PROF9:![0-9]+]], !llvm.loop [[LOOP10:![0-9]+]]
126+
;
95127
entry:
96128
br label %for.body
97129

0 commit comments

Comments
 (0)