Commit 45891ae

Add the ExtNegatedMulAccumulateReduction bundle type

1 parent d985670 commit 45891ae

File tree: 6 files changed, +354 −16 lines

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 2 additions & 2 deletions

@@ -7050,8 +7050,8 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
       }
       // The VPlan-based cost model is more accurate for partial reduction and
       // comparing against the legacy cost isn't desirable.
-      if (auto *VPR = dyn_cast<VPReductionRecipe>(&R); VPR &&
-          VPR->isPartialReduction())
+      if (auto *VPR = dyn_cast<VPReductionRecipe>(&R);
+          VPR && VPR->isPartialReduction())
         return true;
 
 /// If a VPlan transform folded a recipe to one producing a single-scalar,

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 7 additions & 1 deletion

@@ -2702,6 +2702,12 @@ class VPSingleDefBundleRecipe : public VPSingleDefRecipe {
     /// vector operands, performing a reduction.add on the result, and adding
     /// the scalar result to a chain.
     MulAccumulateReduction,
+    /// Represent an inloop multiply-accumulate reduction, multiplying the
+    /// extended vector operands, negating the multiplication, performing a
+    /// reduction.add on the result, and adding the scalar result to a chain.
+    ExtNegatedMulAccumulateReduction,
   };

   /// Type of the bundle.
@@ -2729,7 +2735,7 @@ class VPSingleDefBundleRecipe : public VPSingleDefRecipe {
   VPSingleDefBundleRecipe(VPWidenCastRecipe *Ext0, VPWidenCastRecipe *Ext1,
                           VPWidenRecipe *Mul, VPWidenRecipe *Sub,
                           VPReductionRecipe *Red)
-      : VPSingleDefBundleRecipe(BundleTypes::ExtMulAccumulateReduction,
+      : VPSingleDefBundleRecipe(BundleTypes::ExtNegatedMulAccumulateReduction,
                                 {Ext0, Ext1, Mul, Sub, Red}) {}

   ~VPSingleDefBundleRecipe() override {
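
For intuition, here is a minimal scalar sketch of the computation an ExtNegatedMulAccumulateReduction bundle models, assuming unsigned 8-bit inputs widened to 32 bits (the function name and signature are illustrative, not part of the patch):

#include <cstddef>
#include <cstdint>

// Hypothetical scalar equivalent: each i8 operand is extended to i32, the
// product is negated, and the result is accumulated into the reduction
// chain, i.e. Chain -= A[I] * B[I] (as in the dotp_sub test added below).
int32_t dotpSub(const uint8_t *A, const uint8_t *B, size_t N) {
  int32_t Chain = 0;
  for (size_t I = 0; I < N; ++I)
    Chain += -(int32_t(A[I]) * int32_t(B[I]));
  return Chain;
}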

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 33 additions & 2 deletions

@@ -2570,15 +2570,19 @@ InstructionCost VPSingleDefBundleRecipe::computeCost(ElementCount VF,
   case BundleTypes::MulAccumulateReduction:
     return Ctx.TTI.getMulAccReductionCost(false, RedTy, SrcVecTy, Ctx.CostKind);
 
+  case BundleTypes::ExtNegatedMulAccumulateReduction:
   case BundleTypes::ExtMulAccumulateReduction: {
+    unsigned Opcode =
+        BundleType == BundleTypes::ExtNegatedMulAccumulateReduction
+            ? Instruction::Sub
+            : Instruction::Add;
     if (auto *RedR = dyn_cast<VPReductionRecipe>(BundledRecipes.back());
         RedR->isPartialReduction() && BundledRecipes.size() >= 4) {
       auto *Ext0R = cast<VPWidenCastRecipe>(BundledRecipes[0]);
       auto *Ext1R = cast<VPWidenCastRecipe>(BundledRecipes[1]);
       auto *Mul = cast<VPWidenRecipe>(BundledRecipes[2]);
       return Ctx.TTI.getPartialReductionCost(
-          RecurrenceDescriptor::getOpcode(RedR->getRecurrenceKind()),
-          Ctx.Types.inferScalarType(getOperand(0)),
+          Opcode, Ctx.Types.inferScalarType(getOperand(0)),
           Ctx.Types.inferScalarType(getOperand(1)), RedTy, VF,
           TargetTransformInfo::getPartialReductionExtendKind(
               Ext0R->getOpcode()),
@@ -2633,6 +2637,33 @@ void VPSingleDefBundleRecipe::print(raw_ostream &O, const Twine &Indent,
     O << ")";
     break;
   }
+  case BundleTypes::ExtNegatedMulAccumulateReduction: {
+    getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker);
+    O << " + ";
+    if (Red->isPartialReduction())
+      O << "partial.";
+    O << "reduce."
+      << Instruction::getOpcodeName(
+             RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
+      << " (sub (0, mul";
+    auto *Mul = cast<VPWidenRecipe>(BundledRecipes[2]);
+    Mul->printFlags(O);
+    O << "(";
+    getOperand(0)->printAsOperand(O, SlotTracker);
+    auto *Ext0 = cast<VPWidenCastRecipe>(BundledRecipes[0]);
+    O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
+      << *Ext0->getResultType() << "), (";
+    getOperand(1)->printAsOperand(O, SlotTracker);
+    auto *Ext1 = cast<VPWidenCastRecipe>(BundledRecipes[1]);
+    O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to "
+      << *Ext1->getResultType() << ")";
+    if (Red->isConditional()) {
+      O << ", ";
+      Red->getCondOp()->printAsOperand(O, SlotTracker);
+    }
+    O << "))";
+    break;
+  }
   case BundleTypes::MulAccumulateReduction:
   case BundleTypes::ExtMulAccumulateReduction: {
     getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker);
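
Following this printing logic, the new case should render a bundle roughly as follows (a sketch assuming zext'ed i8 operands and made-up operand names; the exact VPlan operand spelling may differ):

ir<%accum> + partial.reduce.add (sub (0, mul (ir<%a> zext to i32), (ir<%b> zext to i32)))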

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 20 additions & 11 deletions

@@ -2895,7 +2895,8 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
   // Clamp the range if using multiply-accumulate-reduction is profitable.
   auto IsMulAccValidAndClampRange =
       [&](bool IsZExt, VPWidenRecipe *Mul, VPWidenCastRecipe *Ext0,
-          VPWidenCastRecipe *Ext1, VPWidenCastRecipe *OuterExt) -> bool {
+          VPWidenCastRecipe *Ext1, VPWidenCastRecipe *OuterExt,
+          std::optional<VPWidenRecipe *> Sub = std::nullopt) -> bool {
     return LoopVectorizationPlanner::getDecisionAndClampRange(
         [&](ElementCount VF) {
           TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
@@ -2906,6 +2907,8 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
           auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy0, VF));
           InstructionCost MulAccCost;
           if (Red->isPartialReduction()) {
+            unsigned Opcode =
+                Sub.has_value() ? Instruction::Sub : Instruction::Add;
             TargetTransformInfo::PartialReductionExtendKind Ext0Kind =
                 Ext0 ? TargetTransformInfo::getPartialReductionExtendKind(
                            Ext0->getOpcode())
@@ -2941,21 +2944,27 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
   };
 
   VPValue *VecOp = Red->getVecOp();
-  VPValue *Mul = VecOp;
+  VPValue *Mul = nullptr;
+  VPValue *Sub = nullptr;
   VPValue *A, *B;
   // Some chained partial reductions used for complex numbers will have a
   // negation between the mul and reduction. This extracts the mul from that
   // pattern to use it for further checking. The sub should still be bundled.
-  if (Red->isPartialReduction())
-    match(Mul, m_Binary<Instruction::Sub>(m_SpecificInt(0), m_VPValue(Mul)));
+  if (match(VecOp,
+            m_Binary<Instruction::Sub>(m_SpecificInt(0), m_VPValue(Mul))))
+    Sub = VecOp;
+  else
+    Mul = VecOp;
   // Try to match reduce.add(mul(...)).
   if (match(Mul, m_Mul(m_VPValue(A), m_VPValue(B)))) {
     auto *RecipeA =
         dyn_cast_if_present<VPWidenCastRecipe>(A->getDefiningRecipe());
     auto *RecipeB =
         dyn_cast_if_present<VPWidenCastRecipe>(B->getDefiningRecipe());
     auto *MulR = cast<VPWidenRecipe>(Mul->getDefiningRecipe());
-    auto *VecOpR = cast<VPWidenRecipe>(VecOp->getDefiningRecipe());
+    std::optional<VPWidenRecipe *> SubR =
+        Sub ? std::make_optional(cast<VPWidenRecipe>(Sub->getDefiningRecipe()))
+            : std::nullopt;
 
     // Match reduce.add(mul(ext, ext)).
     // Mixed extensions are valid for partial reductions
@@ -2966,12 +2975,12 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
         match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
         IsMulAccValidAndClampRange(RecipeA->getOpcode() ==
                                        Instruction::CastOps::ZExt,
-                                   MulR, RecipeA, RecipeB, nullptr)) {
-      // If the vector operand is the same as the mul then there was no
-      // intervening sub
-      if (VecOpR == MulR)
-        return new VPSingleDefBundleRecipe(RecipeA, RecipeB, MulR, Red);
-      return new VPSingleDefBundleRecipe(RecipeA, RecipeB, MulR, VecOpR, Red);
+                                   MulR, RecipeA, RecipeB, nullptr, SubR)) {
+      if (Sub)
+        return new VPSingleDefBundleRecipe(
+            RecipeA, RecipeB, MulR,
+            cast<VPWidenRecipe>(Sub->getDefiningRecipe()), Red);
+      return new VPSingleDefBundleRecipe(RecipeA, RecipeB, MulR, Red);
     }
     // Match reduce.add(mul).
     if (IsMulAccValidAndClampRange(true, MulR, nullptr, nullptr, nullptr))
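
Schematically, this path now recognizes a recipe chain of the following shape and bundles all five recipes, including the intervening sub, into a single VPSingleDefBundleRecipe (a sketch; the recipe spellings below are illustrative, not literal VPlan dump output):

  Ext0 = WIDEN-CAST zext A to i32
  Ext1 = WIDEN-CAST zext B to i32
  Mul  = WIDEN mul Ext1, Ext0
  Sub  = WIDEN sub 0, Mul
  Red  = REDUCE partial.reduce.add (Chain, Sub)

When no negation is matched (Sub stays nullptr), the existing four-recipe ExtMulAccumulateReduction bundle is created as before.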

llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll

Lines changed: 146 additions & 0 deletions

@@ -151,6 +151,152 @@ for.exit: ; preds = %for.body
   ret i32 %add
 }
 
+define i32 @dotp_sub(ptr %a, ptr %b) #0 {
+; CHECK-INTERLEAVE1-LABEL: define i32 @dotp_sub(
+; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
+; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1: vector.ph:
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
+; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: vector.body:
+; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP9]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = mul <vscale x 4 x i32> [[TMP11]], [[TMP8]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP13]] = sub <vscale x 4 x i32> [[VEC_PHI]], [[TMP12]]
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-INTERLEAVE1: middle.block:
+; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP13]])
+; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVE1: scalar.ph:
+;
+; CHECK-INTERLEAVED-LABEL: define i32 @dotp_sub(
+; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
+; CHECK-INTERLEAVED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED: vector.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP6]], i64 [[TMP9]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP13]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = mul nuw i64 [[TMP15]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP13]], i64 [[TMP16]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i8>, ptr [[TMP14]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 4 x i8>, ptr [[TMP17]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD3]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD4]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = mul <vscale x 4 x i32> [[TMP18]], [[TMP11]]
+; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = mul <vscale x 4 x i32> [[TMP19]], [[TMP12]]
+; CHECK-INTERLEAVED-NEXT: [[TMP22]] = sub <vscale x 4 x i32> [[VEC_PHI]], [[TMP20]]
+; CHECK-INTERLEAVED-NEXT: [[TMP23]] = sub <vscale x 4 x i32> [[VEC_PHI1]], [[TMP21]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-INTERLEAVED: middle.block:
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[TMP23]], [[TMP22]]
+; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVED: scalar.ph:
+;
+; CHECK-MAXBW-LABEL: define i32 @dotp_sub(
+; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-MAXBW-NEXT: entry:
+; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16
+; CHECK-MAXBW-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-MAXBW: vector.ph:
+; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
+; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 16
+; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-MAXBW: vector.body:
+; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
+; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP7]], align 1
+; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
+; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP8]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 16 x i8>, ptr [[TMP9]], align 1
+; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD1]] to <vscale x 16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = mul <vscale x 16 x i32> [[TMP10]], [[TMP11]]
+; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[TMP12]]
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP13]])
+; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-MAXBW: middle.block:
+; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE]])
+; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-MAXBW: scalar.ph:
+;
+entry:
+  br label %for.body
+
+for.body: ; preds = %for.body, %entry
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %gep.a = getelementptr i8, ptr %a, i64 %iv
+  %load.a = load i8, ptr %gep.a, align 1
+  %ext.a = zext i8 %load.a to i32
+  %gep.b = getelementptr i8, ptr %b, i64 %iv
+  %load.b = load i8, ptr %gep.b, align 1
+  %ext.b = zext i8 %load.b to i32
+  %mul = mul i32 %ext.b, %ext.a
+  %add = sub i32 %accum, %mul
+  %iv.next = add i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %for.exit, label %for.body
+
+for.exit: ; preds = %for.body
+  ret i32 %add
+}
+
 define i64 @not_dotp_i8_to_i64_has_neon_dotprod(ptr readonly %a, ptr readonly %b) #1 {
 ; CHECK-INTERLEAVE1-LABEL: define i64 @not_dotp_i8_to_i64_has_neon_dotprod(
 ; CHECK-INTERLEAVE1-SAME: ptr readonly [[A:%.*]], ptr readonly [[B:%.*]]) #[[ATTR1:[0-9]+]] {