Skip to content

Commit 74614d3

Browse files
committed
Add the ExtNegatedMulAccumulateReduction bundle type
1 parent d985670 commit 74614d3

File tree

6 files changed

+346
-15
lines changed

6 files changed

+346
-15
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7050,8 +7050,8 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
70507050
}
70517051
// The VPlan-based cost model is more accurate for partial reduction and
70527052
// comparing against the legacy cost isn't desirable.
7053-
if (auto *VPR = dyn_cast<VPReductionRecipe>(&R); VPR &&
7054-
VPR->isPartialReduction())
7053+
if (auto *VPR = dyn_cast<VPReductionRecipe>(&R);
7054+
VPR && VPR->isPartialReduction())
70557055
return true;
70567056

70577057
/// If a VPlan transform folded a recipe to one producing a single-scalar,

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2702,6 +2702,11 @@ class VPSingleDefBundleRecipe : public VPSingleDefRecipe {
27022702
/// vector operands, performing a reduction.add on the result, and adding
27032703
/// the scalar result to a chain.
27042704
MulAccumulateReduction,
2705+
/// Represent an inloop multiply-accumulate reduction, multiplying the
2706+
/// extended vector operands, negating the multiplication, performing
2707+
/// a reduction.add on the result, and adding the scalar result to a
2708+
/// chain.
2709+
ExtNegatedMulAccumulateReduction,
27052710
};
27062711

27072712
/// Type of the bundle.
@@ -2729,7 +2734,7 @@ class VPSingleDefBundleRecipe : public VPSingleDefRecipe {
27292734
VPSingleDefBundleRecipe(VPWidenCastRecipe *Ext0, VPWidenCastRecipe *Ext1,
27302735
VPWidenRecipe *Mul, VPWidenRecipe *Sub,
27312736
VPReductionRecipe *Red)
2732-
: VPSingleDefBundleRecipe(BundleTypes::ExtMulAccumulateReduction,
2737+
: VPSingleDefBundleRecipe(BundleTypes::ExtNegatedMulAccumulateReduction,
27332738
{Ext0, Ext1, Mul, Sub, Red}) {}
27342739

27352740
~VPSingleDefBundleRecipe() override {

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2570,14 +2570,16 @@ InstructionCost VPSingleDefBundleRecipe::computeCost(ElementCount VF,
25702570
case BundleTypes::MulAccumulateReduction:
25712571
return Ctx.TTI.getMulAccReductionCost(false, RedTy, SrcVecTy, Ctx.CostKind);
25722572

2573+
case BundleTypes::ExtNegatedMulAccumulateReduction:
25732574
case BundleTypes::ExtMulAccumulateReduction: {
2575+
unsigned Opcode = BundleType == BundleTypes::ExtNegatedMulAccumulateReduction ? Instruction::Sub : Instruction::Add;
25742576
if (auto *RedR = dyn_cast<VPReductionRecipe>(BundledRecipes.back());
25752577
RedR->isPartialReduction() && BundledRecipes.size() >= 4) {
25762578
auto *Ext0R = cast<VPWidenCastRecipe>(BundledRecipes[0]);
25772579
auto *Ext1R = cast<VPWidenCastRecipe>(BundledRecipes[1]);
25782580
auto *Mul = cast<VPWidenRecipe>(BundledRecipes[2]);
25792581
return Ctx.TTI.getPartialReductionCost(
2580-
RecurrenceDescriptor::getOpcode(RedR->getRecurrenceKind()),
2582+
Opcode,
25812583
Ctx.Types.inferScalarType(getOperand(0)),
25822584
Ctx.Types.inferScalarType(getOperand(1)), RedTy, VF,
25832585
TargetTransformInfo::getPartialReductionExtendKind(
@@ -2633,6 +2635,33 @@ void VPSingleDefBundleRecipe::print(raw_ostream &O, const Twine &Indent,
26332635
O << ")";
26342636
break;
26352637
}
2638+
case BundleTypes::ExtNegatedMulAccumulateReduction: {
2639+
getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker);
2640+
O << " + ";
2641+
if (Red->isPartialReduction())
2642+
O << "partial.";
2643+
O << "reduce."
2644+
<< Instruction::getOpcodeName(
2645+
RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
2646+
<< " (sub (0, mul";
2647+
auto *Mul = cast<VPWidenRecipe>(BundledRecipes[2]);
2648+
Mul->printFlags(O);
2649+
O << "(";
2650+
getOperand(0)->printAsOperand(O, SlotTracker);
2651+
auto *Ext0 = cast<VPWidenCastRecipe>(BundledRecipes[0]);
2652+
O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
2653+
<< *Ext0->getResultType() << "), (";
2654+
getOperand(1)->printAsOperand(O, SlotTracker);
2655+
auto *Ext1 = cast<VPWidenCastRecipe>(BundledRecipes[1]);
2656+
O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to "
2657+
<< *Ext1->getResultType() << ")";
2658+
if (Red->isConditional()) {
2659+
O << ", ";
2660+
Red->getCondOp()->printAsOperand(O, SlotTracker);
2661+
}
2662+
O << "))";
2663+
break;
2664+
}
26362665
case BundleTypes::MulAccumulateReduction:
26372666
case BundleTypes::ExtMulAccumulateReduction: {
26382667
getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker);

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2895,7 +2895,7 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
28952895
// Clamp the range if using multiply-accumulate-reduction is profitable.
28962896
auto IsMulAccValidAndClampRange =
28972897
[&](bool IsZExt, VPWidenRecipe *Mul, VPWidenCastRecipe *Ext0,
2898-
VPWidenCastRecipe *Ext1, VPWidenCastRecipe *OuterExt) -> bool {
2898+
VPWidenCastRecipe *Ext1, VPWidenCastRecipe *OuterExt, std::optional<VPWidenRecipe*> Sub = std::nullopt) -> bool {
28992899
return LoopVectorizationPlanner::getDecisionAndClampRange(
29002900
[&](ElementCount VF) {
29012901
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
@@ -2906,6 +2906,7 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
29062906
auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy0, VF));
29072907
InstructionCost MulAccCost;
29082908
if (Red->isPartialReduction()) {
2909+
unsigned Opcode = Sub.has_value() ? Instruction::Sub : Instruction::Add;
29092910
TargetTransformInfo::PartialReductionExtendKind Ext0Kind =
29102911
Ext0 ? TargetTransformInfo::getPartialReductionExtendKind(
29112912
Ext0->getOpcode())
@@ -2941,21 +2942,25 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
29412942
};
29422943

29432944
VPValue *VecOp = Red->getVecOp();
2944-
VPValue *Mul = VecOp;
2945+
VPValue *Mul = nullptr;
2946+
VPValue *Sub = nullptr;
29452947
VPValue *A, *B;
29462948
// Some chained partial reductions used for complex numbers will have a
29472949
// negation between the mul and reduction. This extracts the mul from that
29482950
// pattern to use it for further checking. The sub should still be bundled.
2949-
if (Red->isPartialReduction())
2950-
match(Mul, m_Binary<Instruction::Sub>(m_SpecificInt(0), m_VPValue(Mul)));
2951+
if (match(VecOp,
2952+
m_Binary<Instruction::Sub>(m_SpecificInt(0), m_VPValue(Mul))))
2953+
Sub = VecOp;
2954+
else
2955+
Mul = VecOp;
29512956
// Try to match reduce.add(mul(...)).
29522957
if (match(Mul, m_Mul(m_VPValue(A), m_VPValue(B)))) {
29532958
auto *RecipeA =
29542959
dyn_cast_if_present<VPWidenCastRecipe>(A->getDefiningRecipe());
29552960
auto *RecipeB =
29562961
dyn_cast_if_present<VPWidenCastRecipe>(B->getDefiningRecipe());
29572962
auto *MulR = cast<VPWidenRecipe>(Mul->getDefiningRecipe());
2958-
auto *VecOpR = cast<VPWidenRecipe>(VecOp->getDefiningRecipe());
2963+
std::optional<VPWidenRecipe*> SubR = Sub ? std::make_optional(cast<VPWidenRecipe>(Sub->getDefiningRecipe())) : std::nullopt;
29592964

29602965
// Match reduce.add(mul(ext, ext)).
29612966
// Mixed extensions are valid for partial reductions
@@ -2966,12 +2971,12 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
29662971
match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
29672972
IsMulAccValidAndClampRange(RecipeA->getOpcode() ==
29682973
Instruction::CastOps::ZExt,
2969-
MulR, RecipeA, RecipeB, nullptr)) {
2970-
// If the vector operand is the same as the mul then there was no
2971-
// intervening sub
2972-
if (VecOpR == MulR)
2973-
return new VPSingleDefBundleRecipe(RecipeA, RecipeB, MulR, Red);
2974-
return new VPSingleDefBundleRecipe(RecipeA, RecipeB, MulR, VecOpR, Red);
2974+
MulR, RecipeA, RecipeB, nullptr, SubR)) {
2975+
if (Sub)
2976+
return new VPSingleDefBundleRecipe(
2977+
RecipeA, RecipeB, MulR,
2978+
cast<VPWidenRecipe>(Sub->getDefiningRecipe()), Red);
2979+
return new VPSingleDefBundleRecipe(RecipeA, RecipeB, MulR, Red);
29752980
}
29762981
// Match reduce.add(mul).
29772982
if (IsMulAccValidAndClampRange(true, MulR, nullptr, nullptr, nullptr))

llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll

Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,152 @@ for.exit: ; preds = %for.body
151151
ret i32 %add
152152
}
153153

154+
define i32 @dotp_sub(ptr %a, ptr %b) #0 {
155+
; CHECK-INTERLEAVE1-LABEL: define i32 @dotp_sub(
156+
; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
157+
; CHECK-INTERLEAVE1-NEXT: entry:
158+
; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
159+
; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
160+
; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
161+
; CHECK-INTERLEAVE1: vector.ph:
162+
; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
163+
; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
164+
; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
165+
; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
166+
; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
167+
; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
168+
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
169+
; CHECK-INTERLEAVE1: vector.body:
170+
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
171+
; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
172+
; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
173+
; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
174+
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
175+
; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
176+
; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
177+
; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP9]], i32 0
178+
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
179+
; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
180+
; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = mul <vscale x 4 x i32> [[TMP11]], [[TMP8]]
181+
; CHECK-INTERLEAVE1-NEXT: [[TMP13]] = sub <vscale x 4 x i32> [[VEC_PHI]], [[TMP12]]
182+
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
183+
; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
184+
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
185+
; CHECK-INTERLEAVE1: middle.block:
186+
; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP13]])
187+
; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
188+
; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
189+
; CHECK-INTERLEAVE1: scalar.ph:
190+
;
191+
; CHECK-INTERLEAVED-LABEL: define i32 @dotp_sub(
192+
; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
193+
; CHECK-INTERLEAVED-NEXT: entry:
194+
; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
195+
; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
196+
; CHECK-INTERLEAVED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
197+
; CHECK-INTERLEAVED: vector.ph:
198+
; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
199+
; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
200+
; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
201+
; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
202+
; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
203+
; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8
204+
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
205+
; CHECK-INTERLEAVED: vector.body:
206+
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
207+
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ]
208+
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
209+
; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
210+
; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
211+
; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
212+
; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 4
213+
; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP6]], i64 [[TMP9]]
214+
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
215+
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
216+
; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
217+
; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
218+
; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
219+
; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP13]], i32 0
220+
; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
221+
; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = mul nuw i64 [[TMP15]], 4
222+
; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP13]], i64 [[TMP16]]
223+
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i8>, ptr [[TMP14]], align 1
224+
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 4 x i8>, ptr [[TMP17]], align 1
225+
; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD3]] to <vscale x 4 x i32>
226+
; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD4]] to <vscale x 4 x i32>
227+
; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = mul <vscale x 4 x i32> [[TMP18]], [[TMP11]]
228+
; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = mul <vscale x 4 x i32> [[TMP19]], [[TMP12]]
229+
; CHECK-INTERLEAVED-NEXT: [[TMP22]] = sub <vscale x 4 x i32> [[VEC_PHI]], [[TMP20]]
230+
; CHECK-INTERLEAVED-NEXT: [[TMP23]] = sub <vscale x 4 x i32> [[VEC_PHI1]], [[TMP21]]
231+
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
232+
; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
233+
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
234+
; CHECK-INTERLEAVED: middle.block:
235+
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[TMP23]], [[TMP22]]
236+
; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
237+
; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
238+
; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
239+
; CHECK-INTERLEAVED: scalar.ph:
240+
;
241+
; CHECK-MAXBW-LABEL: define i32 @dotp_sub(
242+
; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
243+
; CHECK-MAXBW-NEXT: entry:
244+
; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
245+
; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16
246+
; CHECK-MAXBW-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
247+
; CHECK-MAXBW: vector.ph:
248+
; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
249+
; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
250+
; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
251+
; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
252+
; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
253+
; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 16
254+
; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
255+
; CHECK-MAXBW: vector.body:
256+
; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
257+
; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
258+
; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
259+
; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
260+
; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP7]], align 1
261+
; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
262+
; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP8]], i32 0
263+
; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 16 x i8>, ptr [[TMP9]], align 1
264+
; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD1]] to <vscale x 16 x i32>
265+
; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
266+
; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = mul <vscale x 16 x i32> [[TMP10]], [[TMP11]]
267+
; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[TMP12]]
268+
; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP13]])
269+
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
270+
; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
271+
; CHECK-MAXBW-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
272+
; CHECK-MAXBW: middle.block:
273+
; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE]])
274+
; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
275+
; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
276+
; CHECK-MAXBW: scalar.ph:
277+
;
278+
entry:
279+
br label %for.body
280+
281+
for.body: ; preds = %for.body, %entry
282+
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
283+
%accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
284+
%gep.a = getelementptr i8, ptr %a, i64 %iv
285+
%load.a = load i8, ptr %gep.a, align 1
286+
%ext.a = zext i8 %load.a to i32
287+
%gep.b = getelementptr i8, ptr %b, i64 %iv
288+
%load.b = load i8, ptr %gep.b, align 1
289+
%ext.b = zext i8 %load.b to i32
290+
%mul = mul i32 %ext.b, %ext.a
291+
%add = sub i32 %accum, %mul
292+
%iv.next = add i64 %iv, 1
293+
%exitcond.not = icmp eq i64 %iv.next, 1024
294+
br i1 %exitcond.not, label %for.exit, label %for.body
295+
296+
for.exit: ; preds = %for.body
297+
ret i32 %add
298+
}
299+
154300
define i64 @not_dotp_i8_to_i64_has_neon_dotprod(ptr readonly %a, ptr readonly %b) #1 {
155301
; CHECK-INTERLEAVE1-LABEL: define i64 @not_dotp_i8_to_i64_has_neon_dotprod(
156302
; CHECK-INTERLEAVE1-SAME: ptr readonly [[A:%.*]], ptr readonly [[B:%.*]]) #[[ATTR1:[0-9]+]] {

0 commit comments

Comments
 (0)