diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index c43870392361d..8b93a3f564374 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1645,8 +1645,10 @@ class TargetTransformInfo {
   /// extensions. This is the cost of as:
   /// ResTy vecreduce.add(mul (A, B)).
   /// ResTy vecreduce.add(mul(ext(Ty A), ext(Ty B)).
+  /// The multiply can optionally be negated, which signifies that it is a sub
+  /// reduction.
   LLVM_ABI InstructionCost getMulAccReductionCost(
-      bool IsUnsigned, Type *ResTy, VectorType *Ty,
+      bool IsUnsigned, bool IsNegated, Type *ResTy, VectorType *Ty,
       TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const;
 
   /// Calculate the cost of an extended reduction pattern, similar to
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 12f87226c5f57..d92e0e3d6f2c6 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -959,8 +959,8 @@ class TargetTransformInfoImplBase {
   }
 
   virtual InstructionCost
-  getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *Ty,
-                         TTI::TargetCostKind CostKind) const {
+  getMulAccReductionCost(bool IsUnsigned, bool IsNegated, Type *ResTy,
+                         VectorType *Ty, TTI::TargetCostKind CostKind) const {
     return 1;
   }
 
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index bf958e100f2ac..33382414f9919 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -3115,11 +3115,12 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
   }
 
   InstructionCost
-  getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *Ty,
+  getMulAccReductionCost(bool IsUnsigned, bool IsNegated, Type *ResTy,
+                         VectorType *Ty,
                          TTI::TargetCostKind CostKind) const override {
     // Without any native support, this is equivalent to the cost of
     // vecreduce.add(mul(ext(Ty A), ext(Ty B))) or
-    // vecreduce.add(mul(A, B)).
+    // vecreduce.add(mul(A, B)) with an optional negation of the mul.
     VectorType *ExtTy = VectorType::get(ResTy, Ty);
     InstructionCost RedCost = thisT()->getArithmeticReductionCost(
         Instruction::Add, ExtTy, std::nullopt, CostKind);
@@ -3129,8 +3130,12 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     InstructionCost MulCost =
         thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
+    InstructionCost SubCost =
+        IsNegated
+            ? thisT()->getArithmeticInstrCost(Instruction::Sub, ExtTy, CostKind)
+            : 0;
 
-    return RedCost + MulCost + 2 * ExtCost;
+    return RedCost + SubCost + MulCost + 2 * ExtCost;
   }
 
   InstructionCost getVectorSplitCost() const { return 1; }
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 3ebd9d487ba04..08cacdc67ea0b 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1274,9 +1274,10 @@ InstructionCost TargetTransformInfo::getExtendedReductionCost(
 }
 
 InstructionCost TargetTransformInfo::getMulAccReductionCost(
-    bool IsUnsigned, Type *ResTy, VectorType *Ty,
+    bool IsUnsigned, bool IsNegated, Type *ResTy, VectorType *Ty,
     TTI::TargetCostKind CostKind) const {
-  return TTIImpl->getMulAccReductionCost(IsUnsigned, ResTy, Ty, CostKind);
+  return TTIImpl->getMulAccReductionCost(IsUnsigned, IsNegated, ResTy, Ty,
+                                         CostKind);
 }
 
 InstructionCost
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 380faa6cf6939..47331988ed8fc 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5315,9 +5315,11 @@ InstructionCost AArch64TTIImpl::getExtendedReductionCost(
 }
 
 InstructionCost
-AArch64TTIImpl::getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
-                                       VectorType *VecTy,
+AArch64TTIImpl::getMulAccReductionCost(bool IsUnsigned, bool IsNegated,
+                                       Type *ResTy, VectorType *VecTy,
                                        TTI::TargetCostKind CostKind) const {
+  if (IsNegated)
+    return InstructionCost::getInvalid(CostKind);
   EVT VecVT = TLI->getValueType(DL, VecTy);
   EVT ResVT = TLI->getValueType(DL, ResTy);
 
@@ -5332,7 +5334,8 @@ AArch64TTIImpl::getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
     return LT.first + 2;
   }
 
-  return BaseT::getMulAccReductionCost(IsUnsigned, ResTy, VecTy, CostKind);
+  return BaseT::getMulAccReductionCost(IsUnsigned, IsNegated, ResTy, VecTy,
+                                       CostKind);
 }
 
 InstructionCost
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 9ada70bd7086a..228cccf0399a9 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -447,7 +447,7 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> {
                     TTI::TargetCostKind CostKind) const override;
 
   InstructionCost getMulAccReductionCost(
-      bool IsUnsigned, Type *ResTy, VectorType *Ty,
+      bool IsUnsigned, bool IsNegated, Type *ResTy, VectorType *Ty,
       TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const override;
 
   InstructionCost
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 203fb76d7be86..79d2d675410e9 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1883,9 +1883,11 @@ InstructionCost ARMTTIImpl::getExtendedReductionCost(
 }
 
 InstructionCost
-ARMTTIImpl::getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
+ARMTTIImpl::getMulAccReductionCost(bool IsUnsigned, bool IsNegated, Type *ResTy,
                                    VectorType *ValTy,
                                    TTI::TargetCostKind CostKind) const {
+  if (IsNegated)
+    return InstructionCost::getInvalid(CostKind);
   EVT ValVT = TLI->getValueType(DL, ValTy);
   EVT ResVT = TLI->getValueType(DL, ResTy);
 
@@ -1906,7 +1908,8 @@ ARMTTIImpl::getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
     return ST->getMVEVectorCostFactor(CostKind) * LT.first;
   }
 
-  return BaseT::getMulAccReductionCost(IsUnsigned, ResTy, ValTy, CostKind);
+  return BaseT::getMulAccReductionCost(IsUnsigned, IsNegated, ResTy, ValTy,
+                                       CostKind);
 }
 
 InstructionCost
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index ca06b9e3cb661..325eab27d32c2 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -298,7 +298,8 @@ class ARMTTIImpl final : public BasicTTIImplBase<ARMTTIImpl> {
                     VectorType *ValTy, std::optional<FastMathFlags> FMF,
                     TTI::TargetCostKind CostKind) const override;
   InstructionCost
-  getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *ValTy,
+  getMulAccReductionCost(bool IsUnsigned, bool IsNegated, Type *ResTy,
+                         VectorType *ValTy,
                          TTI::TargetCostKind CostKind) const override;
 
   InstructionCost
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 1cfbcf1336620..b80724f84c3db 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5538,7 +5538,8 @@ LoopVectorizationCostModel::getReductionPatternCost(Instruction *I,
                               TTI::CastContextHint::None, CostKind, RedOp);
 
       InstructionCost RedCost = TTI.getMulAccReductionCost(
-          IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
+          IsUnsigned, /*IsNegated=*/false, RdxDesc.getRecurrenceType(), ExtType,
+          CostKind);
 
       if (RedCost.isValid() &&
           RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
@@ -5583,7 +5584,8 @@ LoopVectorizationCostModel::getReductionPatternCost(Instruction *I,
           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
 
       InstructionCost RedCost = TTI.getMulAccReductionCost(
-          IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
+          IsUnsigned, /*IsNegated=*/false, RdxDesc.getRecurrenceType(), ExtType,
+          CostKind);
       InstructionCost ExtraExtCost = 0;
       if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
         Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
@@ -5602,7 +5604,8 @@ LoopVectorizationCostModel::getReductionPatternCost(Instruction *I,
           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
 
       InstructionCost RedCost = TTI.getMulAccReductionCost(
-          true, RdxDesc.getRecurrenceType(), VectorTy, CostKind);
+          true, /*IsNegated=*/false, RdxDesc.getRecurrenceType(), VectorTy,
+          CostKind);
 
       if (RedCost.isValid() && RedCost < MulCost + BaseCost)
         return I == RetI ? RedCost : 0;
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index d460573f5bec6..4fedbd7d36b77 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2757,6 +2757,10 @@ class VPExpressionRecipe : public VPSingleDefRecipe {
     /// vector operands, performing a reduction.add on the result, and adding
     /// the scalar result to a chain.
     MulAccReduction,
+    /// Represent an inloop multiply-accumulate reduction, multiplying the
+    /// extended vector operands, negating the multiplication, performing a
+    /// reduction.add on the result, and adding the scalar result to a chain.
+    ExtNegatedMulAccReduction,
   };
 
   /// Type of the expression.
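For context on the new ExpressionTypes member: ExtNegatedMulAccReduction bundles reduce.add(sub(0, mul(ext(A), ext(B)))), the form a reduction takes when the loop subtracts a widened product from its accumulator. A minimal C++ sketch of such a source loop, assuming unsigned 8-bit inputs and a 32-bit accumulator (the function name is illustrative, not taken from the patch):

#include <cstdint>

// Each iteration subtracts a widened product from the accumulator; the
// vectorizer can express the loop body as
// accum = accum + vecreduce.add(sub(0, mul(zext(a[i]), zext(b[i])))).
int32_t negated_dot(const uint8_t *a, const uint8_t *b, int n) {
  int32_t acc = 0;
  for (int i = 0; i < n; ++i)
    acc -= int32_t(a[i]) * int32_t(b[i]);
  return acc;
}

The print_mulacc_sub test added at the end of this patch exercises exactly this shape: %mul = mul i32 %ext.b, %ext.a followed by %add = sub i32 %accum, %mul.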
@@ -2780,6 +2784,11 @@ class VPExpressionRecipe : public VPSingleDefRecipe {
                      VPWidenRecipe *Mul, VPReductionRecipe *Red)
       : VPExpressionRecipe(ExpressionTypes::ExtMulAccReduction,
                            {Ext0, Ext1, Mul, Red}) {}
+  VPExpressionRecipe(VPWidenCastRecipe *Ext0, VPWidenCastRecipe *Ext1,
+                     VPWidenRecipe *Mul, VPWidenRecipe *Sub,
+                     VPReductionRecipe *Red)
+      : VPExpressionRecipe(ExpressionTypes::ExtNegatedMulAccReduction,
+                           {Ext0, Ext1, Mul, Sub, Red}) {}
 
   ~VPExpressionRecipe() override {
     for (auto *R : reverse(ExpressionRecipes))
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 318e8171e098d..1c3a5e70fd45b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2672,13 +2672,17 @@ InstructionCost VPExpressionRecipe::computeCost(ElementCount VF,
         RedTy, SrcVecTy, std::nullopt, Ctx.CostKind);
   }
   case ExpressionTypes::MulAccReduction:
-    return Ctx.TTI.getMulAccReductionCost(false, RedTy, SrcVecTy, Ctx.CostKind);
+    return Ctx.TTI.getMulAccReductionCost(false, false, RedTy, SrcVecTy,
+                                          Ctx.CostKind);
 
-  case ExpressionTypes::ExtMulAccReduction:
+  case ExpressionTypes::ExtNegatedMulAccReduction:
+  case ExpressionTypes::ExtMulAccReduction: {
+    bool Negated = ExpressionType == ExpressionTypes::ExtNegatedMulAccReduction;
     return Ctx.TTI.getMulAccReductionCost(
         cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() ==
             Instruction::ZExt,
-        RedTy, SrcVecTy, Ctx.CostKind);
+        Negated, RedTy, SrcVecTy, Ctx.CostKind);
+  }
   }
   llvm_unreachable("Unknown VPExpressionRecipe::ExpressionTypes enum");
 }
@@ -2725,6 +2729,31 @@ void VPExpressionRecipe::print(raw_ostream &O, const Twine &Indent,
     O << ")";
     break;
   }
+  case ExpressionTypes::ExtNegatedMulAccReduction: {
+    getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker);
+    O << " + ";
+    O << "reduce."
+      << Instruction::getOpcodeName(
+             RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
+      << " (sub (0, mul";
+    auto *Mul = cast<VPWidenRecipe>(ExpressionRecipes[2]);
+    Mul->printFlags(O);
+    O << "(";
+    getOperand(0)->printAsOperand(O, SlotTracker);
+    auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
+    O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
+      << *Ext0->getResultType() << "), (";
+    getOperand(1)->printAsOperand(O, SlotTracker);
+    auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
+    O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to "
+      << *Ext1->getResultType() << ")";
+    if (Red->isConditional()) {
+      O << ", ";
+      Red->getCondOp()->printAsOperand(O, SlotTracker);
+    }
+    O << "))";
+    break;
+  }
   case ExpressionTypes::MulAccReduction:
   case ExpressionTypes::ExtMulAccReduction: {
     getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 931d4d42f56e4..b688dd4a251ab 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2908,16 +2908,17 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
 
   // Clamp the range if using multiply-accumulate-reduction is profitable.
   auto IsMulAccValidAndClampRange =
-      [&](bool isZExt, VPWidenRecipe *Mul, VPWidenCastRecipe *Ext0,
-          VPWidenCastRecipe *Ext1, VPWidenCastRecipe *OuterExt) -> bool {
+      [&](bool IsZExt, VPWidenRecipe *Mul, VPWidenCastRecipe *Ext0,
+          VPWidenCastRecipe *Ext1, VPWidenCastRecipe *OuterExt,
+          bool Negated = false) -> bool {
     return LoopVectorizationPlanner::getDecisionAndClampRange(
         [&](ElementCount VF) {
           TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
           Type *SrcTy =
               Ext0 ? Ctx.Types.inferScalarType(Ext0->getOperand(0)) : RedTy;
           auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
-          InstructionCost MulAccCost =
-              Ctx.TTI.getMulAccReductionCost(isZExt, RedTy, SrcVecTy, CostKind);
+          InstructionCost MulAccCost = Ctx.TTI.getMulAccReductionCost(
+              IsZExt, Negated, RedTy, SrcVecTy, CostKind);
           InstructionCost MulCost = Mul->computeCost(VF, Ctx);
           InstructionCost RedCost = Red->computeCost(VF, Ctx);
           InstructionCost ExtCost = 0;
@@ -2935,14 +2936,22 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
   };
 
   VPValue *VecOp = Red->getVecOp();
+  VPValue *Mul = nullptr;
+  VPValue *Sub = nullptr;
   VPValue *A, *B;
+  // Sub reductions will have a sub between the add reduction and vec op.
+  if (match(VecOp,
+            m_Binary<Instruction::Sub>(m_SpecificInt(0), m_VPValue(Mul))))
+    Sub = VecOp;
+  else
+    Mul = VecOp;
   // Try to match reduce.add(mul(...)).
-  if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
+  if (match(Mul, m_Mul(m_VPValue(A), m_VPValue(B)))) {
     auto *RecipeA =
         dyn_cast_if_present<VPWidenCastRecipe>(A->getDefiningRecipe());
     auto *RecipeB =
         dyn_cast_if_present<VPWidenCastRecipe>(B->getDefiningRecipe());
-    auto *Mul = cast<VPWidenRecipe>(VecOp->getDefiningRecipe());
+    auto *MulR = cast<VPWidenRecipe>(Mul->getDefiningRecipe());
 
     // Match reduce.add(mul(ext, ext)).
     if (RecipeA && RecipeB &&
         match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
         match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
         IsMulAccValidAndClampRange(RecipeA->getOpcode() ==
                                        Instruction::CastOps::ZExt,
-                                   Mul, RecipeA, RecipeB, nullptr)) {
-      return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
+                                   MulR, RecipeA, RecipeB, nullptr, Sub)) {
+      if (Sub)
+        return new VPExpressionRecipe(
+            RecipeA, RecipeB, MulR,
+            cast<VPWidenRecipe>(Sub->getDefiningRecipe()), Red);
+      return new VPExpressionRecipe(RecipeA, RecipeB, MulR, Red);
     }
     // Match reduce.add(mul).
-    if (IsMulAccValidAndClampRange(true, Mul, nullptr, nullptr, nullptr))
-      return new VPExpressionRecipe(Mul, Red);
+    if (IsMulAccValidAndClampRange(true, MulR, nullptr, nullptr, nullptr, Sub))
+      return new VPExpressionRecipe(MulR, Red);
   }
 
   // Match reduce.add(ext(mul(ext(A), ext(B)))).
   // All extend recipes must have same opcode or A == B
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index b2fced47b9527..a9b7785c5f341 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -1401,8 +1401,8 @@ static void analyzeCostOfVecReduction(const IntrinsicInst &II,
                                 TTI::CastContextHint::None, CostKind, RedOp);
 
     CostBeforeReduction = ExtCost * 2 + MulCost + Ext2Cost;
-    CostAfterReduction =
-        TTI.getMulAccReductionCost(IsUnsigned, II.getType(), ExtType, CostKind);
+    CostAfterReduction = TTI.getMulAccReductionCost(
+        IsUnsigned, /*IsNegated=*/false, II.getType(), ExtType, CostKind);
     return;
   }
   CostAfterReduction = TTI.getArithmeticReductionCost(ReductionOpc, VecRedTy,
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
index 4af3fa9202c77..8059ac12ecd2e 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
@@ -416,3 +416,146 @@ exit:
   %r.0.lcssa = phi i64 [ %rdx.next, %loop ]
   ret i64 %r.0.lcssa
 }
+
+define i32 @print_mulacc_sub(ptr %a, ptr %b) {
+; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' {
+; CHECK-NEXT: Live-in vp<%0> = VF
+; CHECK-NEXT: Live-in vp<%1> = VF * UF
+; CHECK-NEXT: Live-in vp<%2> = vector-trip-count
+; CHECK-NEXT: Live-in ir<1024> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<entry>:
+; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT:   EMIT vp<%3> = reduction-start-vector ir<0>, ir<0>, ir<1>
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT:   vector.body:
+; CHECK-NEXT:     EMIT vp<%4> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT:     WIDEN-REDUCTION-PHI ir<%accum> = phi vp<%3>, vp<%8>
+; CHECK-NEXT:     vp<%5> = SCALAR-STEPS vp<%4>, ir<1>, vp<%0>
+; CHECK-NEXT:     CLONE ir<%gep.a> = getelementptr ir<%a>, vp<%5>
+; CHECK-NEXT:     vp<%6> = vector-pointer ir<%gep.a>
+; CHECK-NEXT:     WIDEN ir<%load.a> = load vp<%6>
+; CHECK-NEXT:     CLONE ir<%gep.b> = getelementptr ir<%b>, vp<%5>
+; CHECK-NEXT:     vp<%7> = vector-pointer ir<%gep.b>
+; CHECK-NEXT:     WIDEN ir<%load.b> = load vp<%7>
+; CHECK-NEXT:     EXPRESSION vp<%8> = ir<%accum> + reduce.add (sub (0, mul (ir<%load.b> zext to i32), (ir<%load.a> zext to i32)))
+; CHECK-NEXT:     EMIT vp<%index.next> = add nuw vp<%4>, vp<%1>
+; CHECK-NEXT:     EMIT branch-on-count vp<%index.next>, vp<%2>
+; CHECK-NEXT:   No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT:   EMIT vp<%10> = compute-reduction-result ir<%accum>, vp<%8>
+; CHECK-NEXT:   EMIT vp<%cmp.n> = icmp eq ir<1024>, vp<%2>
+; CHECK-NEXT:   EMIT branch-on-cond vp<%cmp.n>
+; CHECK-NEXT: Successor(s): ir-bb<for.exit>, scalar.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<for.exit>:
+; CHECK-NEXT:   IR %add.lcssa = phi i32 [ %add, %for.body ] (extra operand: vp<%10> from middle.block)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:   EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%2>, middle.block ], [ ir<0>, ir-bb<entry> ]
+; CHECK-NEXT:   EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%10>, middle.block ], [ ir<0>, ir-bb<entry> ]
+; CHECK-NEXT: Successor(s): ir-bb<for.body>
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<for.body>:
+; CHECK-NEXT:   IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] (extra operand: vp<%bc.resume.val> from scalar.ph)
+; CHECK-NEXT:   IR %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] (extra operand: vp<%bc.merge.rdx> from scalar.ph)
+; CHECK-NEXT:   IR %gep.a = getelementptr i8, ptr %a, i64 %iv
+; CHECK-NEXT:   IR %load.a = load i8, ptr %gep.a, align 1
+; CHECK-NEXT:   IR %ext.a = zext i8 %load.a to i32
+; CHECK-NEXT:   IR %gep.b = getelementptr i8, ptr %b, i64 %iv
+; CHECK-NEXT:   IR %load.b = load i8, ptr %gep.b, align 1
+; CHECK-NEXT:   IR %ext.b = zext i8 %load.b to i32
+; CHECK-NEXT:   IR %mul = mul i32 %ext.b, %ext.a
+; CHECK-NEXT:   IR %add = sub i32 %accum, %mul
+; CHECK-NEXT:   IR %iv.next = add i64 %iv, 1
+; CHECK-NEXT:   IR %exitcond.not = icmp eq i64 %iv.next, 1024
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK: VPlan 'Final VPlan for VF={4},UF={1}' {
+; CHECK-NEXT: Live-in ir<4> = VF * UF
+; CHECK-NEXT: Live-in ir<1024> = vector-trip-count
+; CHECK-NEXT: Live-in ir<1024> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<entry>:
+; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, ir-bb<vector.ph>
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<vector.ph>:
+; CHECK-NEXT: Successor(s): vector.body
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT:   EMIT-SCALAR vp<%index> = phi [ ir<0>, ir-bb<vector.ph> ], [ vp<%index.next>, vector.body ]
+; CHECK-NEXT:   WIDEN-REDUCTION-PHI ir<%accum> = phi ir<0>, ir<%add>.1
+; CHECK-NEXT:   CLONE ir<%gep.a> = getelementptr ir<%a>, vp<%index>
+; CHECK-NEXT:   vp<%1> = vector-pointer ir<%gep.a>
+; CHECK-NEXT:   WIDEN ir<%load.a> = load vp<%1>
+; CHECK-NEXT:   CLONE ir<%gep.b> = getelementptr ir<%b>, vp<%index>
+; CHECK-NEXT:   vp<%2> = vector-pointer ir<%gep.b>
+; CHECK-NEXT:   WIDEN ir<%load.b> = load vp<%2>
+; CHECK-NEXT:   WIDEN-CAST ir<%ext.b> = zext ir<%load.b> to i32
+; CHECK-NEXT:   WIDEN-CAST ir<%ext.a> = zext ir<%load.a> to i32
+; CHECK-NEXT:   WIDEN ir<%mul> = mul ir<%ext.b>, ir<%ext.a>
+; CHECK-NEXT:   WIDEN ir<%add> = sub ir<0>, ir<%mul>
+; CHECK-NEXT:   REDUCE ir<%add>.1 = ir<%accum> + reduce.add (ir<%add>)
+; CHECK-NEXT:   EMIT vp<%index.next> = add nuw vp<%index>, ir<4>
+; CHECK-NEXT:   EMIT branch-on-count vp<%index.next>, ir<1024>
+; CHECK-NEXT: Successor(s): middle.block, vector.body
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT:   EMIT vp<%4> = compute-reduction-result ir<%accum>, ir<%add>.1
+; CHECK-NEXT:   EMIT vp<%cmp.n> = icmp eq ir<1024>, ir<1024>
+; CHECK-NEXT:   EMIT branch-on-cond vp<%cmp.n>
+; CHECK-NEXT: Successor(s): ir-bb<for.exit>, ir-bb<scalar.ph>
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<for.exit>:
+; CHECK-NEXT:   IR %add.lcssa = phi i32 [ %add, %for.body ] (extra operand: vp<%4> from middle.block)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<scalar.ph>:
+; CHECK-NEXT:   EMIT-SCALAR vp<%bc.resume.val> = phi [ ir<1024>, middle.block ], [ ir<0>, ir-bb<entry> ]
+; CHECK-NEXT:   EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%4>, middle.block ], [ ir<0>, ir-bb<entry> ]
+; CHECK-NEXT: Successor(s): ir-bb<for.body>
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<for.body>:
+; CHECK-NEXT:   IR %iv = phi i64 [ 0, %scalar.ph ], [ %iv.next, %for.body ] (extra operand: vp<%bc.resume.val> from ir-bb<scalar.ph>)
+; CHECK-NEXT:   IR %accum = phi i32 [ 0, %scalar.ph ], [ %add, %for.body ] (extra operand: vp<%bc.merge.rdx> from ir-bb<scalar.ph>)
+; CHECK-NEXT:   IR %gep.a = getelementptr i8, ptr %a, i64 %iv
+; CHECK-NEXT:   IR %load.a = load i8, ptr %gep.a, align 1
+; CHECK-NEXT:   IR %ext.a = zext i8 %load.a to i32
+; CHECK-NEXT:   IR %gep.b = getelementptr i8, ptr %b, i64 %iv
+; CHECK-NEXT:   IR %load.b = load i8, ptr %gep.b, align 1
+; CHECK-NEXT:   IR %ext.b = zext i8 %load.b to i32
+; CHECK-NEXT:   IR %mul = mul i32 %ext.b, %ext.a
+; CHECK-NEXT:   IR %add = sub i32 %accum, %mul
+; CHECK-NEXT:   IR %iv.next = add i64 %iv, 1
+; CHECK-NEXT:   IR %exitcond.not = icmp eq i64 %iv.next, 1024
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %gep.a = getelementptr i8, ptr %a, i64 %iv
+  %load.a = load i8, ptr %gep.a, align 1
+  %ext.a = zext i8 %load.a to i32
+  %gep.b = getelementptr i8, ptr %b, i64 %iv
+  %load.b = load i8, ptr %gep.b, align 1
+  %ext.b = zext i8 %load.b to i32
+  %mul = mul i32 %ext.b, %ext.a
+  %add = sub i32 %accum, %mul
+  %iv.next = add i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %for.exit, label %for.body
+
+for.exit:                                         ; preds = %for.body
+  ret i32 %add
+}
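For a target that implements getMulAccReductionCost but has not yet audited its dot-product paths for the negated form, the safe default is the guard this patch adds to AArch64 and ARM. A sketch only, assuming the usual TTI subclass boilerplate; MyTargetTTIImpl is a placeholder name, not part of the patch:

// Refuse to cost the bundled negated form; everything else falls back to
// the base-class formula (RedCost + SubCost + MulCost + 2 * ExtCost) shown
// in BasicTTIImplBase above.
InstructionCost
MyTargetTTIImpl::getMulAccReductionCost(bool IsUnsigned, bool IsNegated,
                                        Type *ResTy, VectorType *VecTy,
                                        TTI::TargetCostKind CostKind) const {
  if (IsNegated)
    return InstructionCost::getInvalid(CostKind);
  // ... native multiply-accumulate fast paths would go here ...
  return BaseT::getMulAccReductionCost(IsUnsigned, IsNegated, ResTy, VecTy,
                                       CostKind);
}

Because IsMulAccValidAndClampRange only forms the combined VPExpressionRecipe when the bundled cost is judged profitable, returning an invalid cost simply leaves the mul, sub and reduce.add recipes to be costed individually.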