Skip to content

Commit 5e54c92

Browse files
authored
[VPlan] Fix crash when unrolling in-loop reduction chains (llvm#129840)
If an in-loop reduction is chained, e.g.

    WIDEN-REDUCTION-PHI ir<%rdx> = phi ir<0>, ir<%add2>
    REDUCE ir<%add1> = ir<%rdx> + reduce.add (ir<%x>)
    REDUCE ir<%add2> = ir<%add1> + reduce.add (ir<%y>)

When we try to unroll the second add reduction, we crash because we currently expect the chain to be a VPReductionPHIRecipe, when in fact it's the previous reduction.

This relaxes the cast to a dyn_cast, so we end up unrolling to:

    WIDEN-REDUCTION-PHI ir<%rdx> = phi ir<0>, ir<%add2>
    WIDEN-REDUCTION-PHI ir<%rdx>.1 = phi ir<0>, ir<%add2>.1, ir<1>
    WIDEN-REDUCTION-PHI ir<%rdx>.2 = phi ir<0>, ir<%add2>.2, ir<2>
    WIDEN-REDUCTION-PHI ir<%rdx>.3 = phi ir<0>, ir<%add2>.3, ir<3>
    REDUCE ir<%add1> = ir<%rdx> + reduce.add (ir<%x>)
    REDUCE ir<%add1>.1 = ir<%rdx>.1 + reduce.add (ir<%x>.1)
    REDUCE ir<%add1>.2 = ir<%rdx>.2 + reduce.add (ir<%x>.2)
    REDUCE ir<%add1>.3 = ir<%rdx>.3 + reduce.add (ir<%x>.3)
    REDUCE ir<%add2> = ir<%add1> + reduce.add (ir<%y>)
    REDUCE ir<%add2>.1 = ir<%add1>.1 + reduce.add (ir<%y>.1)
    REDUCE ir<%add2>.2 = ir<%add1>.2 + reduce.add (ir<%y>.2)
    REDUCE ir<%add2>.3 = ir<%add1>.3 + reduce.add (ir<%y>.3)

This fixes a crash when building 525.x264_r from SPEC CPU 2017 on AArch64 with -mllvm -prefer-inloop-reductions.
1 parent ea15e8b commit 5e54c92

File tree

2 files changed

+91
-6
lines changed

2 files changed

+91
-6
lines changed

llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -295,8 +295,8 @@ void UnrollState::unrollRecipeByUF(VPRecipeBase &R) {
295295
continue;
296296
}
297297
if (auto *Red = dyn_cast<VPReductionRecipe>(&R)) {
298-
auto *Phi = cast<VPReductionPHIRecipe>(R.getOperand(0));
299-
if (Phi->isOrdered()) {
298+
auto *Phi = dyn_cast<VPReductionPHIRecipe>(R.getOperand(0));
299+
if (Phi && Phi->isOrdered()) {
300300
auto &Parts = VPV2Parts[Phi];
301301
if (Part == 1) {
302302
Parts.clear();

llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll

Lines changed: 89 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,91 @@ entry:
6666
ret i32 %sum.0.lcssa
6767
}
6868

69+
; Check that we correctly unroll two reductions chained together.
70+
define i64 @reduction_sum_chain(ptr noalias %p, ptr noalias %q) {
71+
; CHECK-LABEL: @reduction_sum_chain(
72+
; CHECK-NEXT: entry:
73+
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
74+
; CHECK: vector.ph:
75+
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
76+
; CHECK: vector.body:
77+
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
78+
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
79+
; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
80+
; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
81+
; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
82+
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[INDEX]]
83+
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[INDEX]]
84+
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i64 32
85+
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 64
86+
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[TMP0]], i64 96
87+
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP0]], align 8
88+
; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
89+
; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
90+
; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
91+
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP1]], i64 32
92+
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP1]], i64 64
93+
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP1]], i64 96
94+
; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
95+
; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8
96+
; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x i64>, ptr [[TMP6]], align 8
97+
; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x i64>, ptr [[TMP7]], align 8
98+
; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[WIDE_LOAD]])
99+
; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], [[VEC_PHI]]
100+
; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[WIDE_LOAD4]])
101+
; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], [[VEC_PHI1]]
102+
; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[WIDE_LOAD5]])
103+
; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[TMP12]], [[VEC_PHI2]]
104+
; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[WIDE_LOAD6]])
105+
; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[TMP14]], [[VEC_PHI3]]
106+
; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[WIDE_LOAD7]])
107+
; CHECK-NEXT: [[TMP17]] = add i64 [[TMP16]], [[TMP9]]
108+
; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[WIDE_LOAD8]])
109+
; CHECK-NEXT: [[TMP19]] = add i64 [[TMP18]], [[TMP11]]
110+
; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[WIDE_LOAD9]])
111+
; CHECK-NEXT: [[TMP21]] = add i64 [[TMP20]], [[TMP13]]
112+
; CHECK-NEXT: [[TMP22:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[WIDE_LOAD10]])
113+
; CHECK-NEXT: [[TMP23]] = add i64 [[TMP22]], [[TMP15]]
114+
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
115+
; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
116+
; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
117+
; CHECK: middle.block:
118+
; CHECK-NEXT: [[BIN_RDX:%.*]] = add i64 [[TMP19]], [[TMP17]]
119+
; CHECK-NEXT: [[BIN_RDX11:%.*]] = add i64 [[TMP21]], [[BIN_RDX]]
120+
; CHECK-NEXT: [[BIN_RDX12:%.*]] = add i64 [[TMP23]], [[BIN_RDX11]]
121+
; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
122+
; CHECK: scalar.ph:
123+
; CHECK-NEXT: br label [[LOOP:%.*]]
124+
; CHECK: loop:
125+
; CHECK-NEXT: br i1 poison, label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
126+
; CHECK: exit:
127+
; CHECK-NEXT: [[ADD2_LCSSA:%.*]] = phi i64 [ poison, [[LOOP]] ], [ [[BIN_RDX12]], [[MIDDLE_BLOCK]] ]
128+
; CHECK-NEXT: ret i64 [[ADD2_LCSSA]]
129+
;
130+
entry:
131+
br label %loop
132+
133+
loop:
134+
%iv = phi i64 [0, %entry], [%iv.next, %loop]
135+
%rdx = phi i64 [0, %entry], [%add2, %loop]
136+
137+
%p.gep = getelementptr i64, ptr %p, i64 %iv
138+
%q.gep = getelementptr i64, ptr %q, i64 %iv
139+
140+
%x = load i64, ptr %p.gep
141+
%y = load i64, ptr %q.gep
142+
143+
%add1 = add i64 %rdx, %x
144+
%add2 = add i64 %add1, %y
145+
146+
%iv.next = add i64 %iv, 1
147+
%done = icmp eq i64 %iv.next, 256
148+
br i1 %done, label %exit, label %loop
149+
150+
exit:
151+
ret i64 %add2
152+
}
153+
69154
define i32 @predicated(ptr noalias nocapture %A) {
70155
; CHECK-LABEL: @predicated(
71156
; CHECK-NEXT: entry:
@@ -260,7 +345,7 @@ define i32 @predicated(ptr noalias nocapture %A) {
260345
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
261346
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 16)
262347
; CHECK-NEXT: [[TMP111:%.*]] = icmp eq i64 [[INDEX_NEXT]], 272
263-
; CHECK-NEXT: br i1 [[TMP111]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
348+
; CHECK-NEXT: br i1 [[TMP111]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
264349
; CHECK: middle.block:
265350
; CHECK-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP104]], [[TMP101]]
266351
; CHECK-NEXT: [[BIN_RDX37:%.*]] = add i32 [[TMP107]], [[BIN_RDX]]
@@ -269,7 +354,7 @@ define i32 @predicated(ptr noalias nocapture %A) {
269354
; CHECK: scalar.ph:
270355
; CHECK-NEXT: br label [[DOTLR_PH:%.*]]
271356
; CHECK: .lr.ph:
272-
; CHECK-NEXT: br i1 poison, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop [[LOOP5:![0-9]+]]
357+
; CHECK-NEXT: br i1 poison, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop [[LOOP7:![0-9]+]]
273358
; CHECK: ._crit_edge:
274359
; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ poison, [[DOTLR_PH]] ], [ [[BIN_RDX38]], [[MIDDLE_BLOCK]] ]
275360
; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]]
@@ -499,7 +584,7 @@ define i32 @cond_rdx_pred(i32 %cond, ptr noalias %a, i64 %N) {
499584
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
500585
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 16)
501586
; CHECK-NEXT: [[TMP119:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
502-
; CHECK-NEXT: br i1 [[TMP119]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
587+
; CHECK-NEXT: br i1 [[TMP119]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
503588
; CHECK: middle.block:
504589
; CHECK-NEXT: [[BIN_RDX:%.*]] = mul i32 [[TMP112]], [[TMP109]]
505590
; CHECK-NEXT: [[BIN_RDX39:%.*]] = mul i32 [[TMP115]], [[BIN_RDX]]
@@ -512,7 +597,7 @@ define i32 @cond_rdx_pred(i32 %cond, ptr noalias %a, i64 %N) {
512597
; CHECK: if.then:
513598
; CHECK-NEXT: br label [[FOR_INC]]
514599
; CHECK: for.inc:
515-
; CHECK-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
600+
; CHECK-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
516601
; CHECK: for.end:
517602
; CHECK-NEXT: [[RES_LCSSA:%.*]] = phi i32 [ poison, [[FOR_INC]] ], [ [[BIN_RDX40]], [[MIDDLE_BLOCK]] ]
518603
; CHECK-NEXT: ret i32 [[RES_LCSSA]]

0 commit comments

Comments
 (0)