Skip to content

Commit 39f3dab

Browse files
committed
[LV] Create in-loop sub reductions
This PR allows the loop vectorizer to handle sub reductions in-loop by forming a normal add reduction with a negated input, i.e. `x -= a[i]` is treated as `x += (0 - a[i])`.
1 parent 7d52b09 commit 39f3dab

File tree

3 files changed

+17
-4
lines changed

3 files changed

+17
-4
lines changed

llvm/lib/Analysis/IVDescriptors.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1263,6 +1263,10 @@ RecurrenceDescriptor::getReductionOpChain(PHINode *Phi, Loop *L) const {
12631263
if (isFMulAddIntrinsic(Cur))
12641264
return true;
12651265

1266+
// Recognize a sub reduction. It gets canonicalized to add(sub (0, ...)).
1267+
if (Cur->getOpcode() == Instruction::Sub && getOpcode() == Instruction::Add)
1268+
return true;
1269+
12661270
return Cur->getOpcode() == getOpcode();
12671271
};
12681272

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9144,6 +9144,14 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
91449144
CurrentLinkI->getFastMathFlags());
91459145
LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator());
91469146
VecOp = FMulRecipe;
9147+
} else if (PhiR->isInLoop() && Kind == RecurKind::Add &&
9148+
CurrentLinkI->getOpcode() == Instruction::Sub) {
9149+
Type *PhiTy = PhiR->getUnderlyingValue()->getType();
9150+
auto *Zero = Plan->getOrAddLiveIn(ConstantInt::get(PhiTy, 0));
9151+
VPWidenRecipe *Sub = new VPWidenRecipe(
9152+
*CurrentLinkI, {Zero, CurrentLink->getOperand(1)});
9153+
LinkVPBB->insert(Sub, CurrentLink->getIterator());
9154+
VecOp = Sub;
91479155
} else {
91489156
if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
91499157
if (isa<VPWidenRecipe>(CurrentLink)) {

llvm/test/Transforms/LoopVectorize/reduction-inloop.ll

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -627,7 +627,7 @@ for.end: ; preds = %for.body, %entry
627627
ret float %result.0.lcssa
628628
}
629629

630-
; Sub we can create a reduction, but not inloop
630+
; Sub: we can create an in-loop reduction
631631
define i32 @reduction_sub_lhs(ptr noalias nocapture %A) {
632632
; CHECK-LABEL: @reduction_sub_lhs(
633633
; CHECK-NEXT: entry:
@@ -636,15 +636,16 @@ define i32 @reduction_sub_lhs(ptr noalias nocapture %A) {
636636
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
637637
; CHECK: vector.body:
638638
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
639-
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP1:%.*]], [[VECTOR_BODY]] ]
639+
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
640640
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
641641
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4
642-
; CHECK-NEXT: [[TMP1]] = sub <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
642+
; CHECK-NEXT: [[TMP4:%.*]] = sub nsw <4 x i32> zeroinitializer, [[WIDE_LOAD]]
643+
; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
644+
; CHECK-NEXT: [[TMP3]] = add i32 [[TMP1]], [[VEC_PHI]]
643645
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
644646
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
645647
; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
646648
; CHECK: middle.block:
647-
; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
648649
; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
649650
; CHECK: scalar.ph:
650651
; CHECK-NEXT: br label [[FOR_BODY:%.*]]

0 commit comments

Comments
 (0)