
Commit 10c4727

[LoopVectorizer] Bundle partial reductions with different extensions
This PR adds support for extensions of different signedness to VPMulAccumulateReductionRecipe and allows such partial reductions to be bundled into that class.
1 parent: 700d0d6 · commit: 10c4727
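To illustrate the pattern this change targets, here is a hypothetical C++ source loop (not taken from the commit) corresponding to the dotp_s_z test updated below: one multiply operand is sign-extended and the other zero-extended before accumulation, which previously kept the loop vectorizer from bundling the multiply-accumulate into a VPMulAccumulateReductionRecipe.

#include <cstdint>

// Hypothetical source-level equivalent of the dotp_s_z test below: the
// signed operand is sign-extended and the unsigned operand zero-extended
// to i32 before the multiply-accumulate.
int32_t dotp_s_z(const int8_t *a, const uint8_t *b) {
  int32_t sum = 0;
  for (int i = 0; i < 1024; ++i)
    sum += static_cast<int32_t>(a[i]) *  // sext i8 -> i32
           static_cast<int32_t>(b[i]);   // zext i8 -> i32
  return sum;
}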

File tree

5 files changed: +99, -80 lines changed

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 27 additions & 15 deletions
@@ -2493,11 +2493,13 @@ class VPExtendedReductionRecipe : public VPReductionRecipe {
 /// recipe is abstract and needs to be lowered to concrete recipes before
 /// codegen. The Operands are {ChainOp, VecOp1, VecOp2, [Condition]}.
 class VPMulAccumulateReductionRecipe : public VPReductionRecipe {
-  /// Opcode of the extend recipe.
-  Instruction::CastOps ExtOp;
+  /// Opcodes of the extend recipes.
+  Instruction::CastOps ExtOp0;
+  Instruction::CastOps ExtOp1;
 
-  /// Non-neg flag of the extend recipe.
-  bool IsNonNeg = false;
+  /// Non-neg flags of the extend recipe.
+  bool IsNonNeg0 = false;
+  bool IsNonNeg1 = false;
 
   Type *ResultTy;
 
@@ -2512,7 +2514,8 @@ class VPMulAccumulateReductionRecipe : public VPReductionRecipe {
             MulAcc->getCondOp(), MulAcc->isOrdered(),
             WrapFlagsTy(MulAcc->hasNoUnsignedWrap(), MulAcc->hasNoSignedWrap()),
             MulAcc->getDebugLoc()),
-        ExtOp(MulAcc->getExtOpcode()), IsNonNeg(MulAcc->isNonNeg()),
+        ExtOp0(MulAcc->getExt0Opcode()), ExtOp1(MulAcc->getExt1Opcode()),
+        IsNonNeg0(MulAcc->isNonNeg0()), IsNonNeg1(MulAcc->isNonNeg1()),
         ResultTy(MulAcc->getResultType()),
         IsPartialReduction(MulAcc->isPartialReduction()) {}
 
@@ -2526,7 +2529,8 @@ class VPMulAccumulateReductionRecipe : public VPReductionRecipe {
             R->getCondOp(), R->isOrdered(),
             WrapFlagsTy(Mul->hasNoUnsignedWrap(), Mul->hasNoSignedWrap()),
             R->getDebugLoc()),
-        ExtOp(Ext0->getOpcode()), IsNonNeg(Ext0->isNonNeg()),
+        ExtOp0(Ext0->getOpcode()), ExtOp1(Ext1->getOpcode()),
+        IsNonNeg0(Ext0->isNonNeg()), IsNonNeg1(Ext1->isNonNeg()),
         ResultTy(ResultTy),
         IsPartialReduction(isa<VPPartialReductionRecipe>(R)) {
     assert(RecurrenceDescriptor::getOpcode(getRecurrenceKind()) ==
@@ -2542,7 +2546,8 @@ class VPMulAccumulateReductionRecipe : public VPReductionRecipe {
             R->getCondOp(), R->isOrdered(),
             WrapFlagsTy(Mul->hasNoUnsignedWrap(), Mul->hasNoSignedWrap()),
             R->getDebugLoc()),
-        ExtOp(Instruction::CastOps::CastOpsEnd) {
+        ExtOp0(Instruction::CastOps::CastOpsEnd),
+        ExtOp1(Instruction::CastOps::CastOpsEnd) {
     assert(RecurrenceDescriptor::getOpcode(getRecurrenceKind()) ==
                Instruction::Add &&
            "The reduction instruction in MulAccumulateReductionRecipe must be "
@@ -2586,19 +2591,26 @@ class VPMulAccumulateReductionRecipe : public VPReductionRecipe {
   VPValue *getVecOp1() const { return getOperand(2); }
 
   /// Return if this MulAcc recipe contains extend instructions.
-  bool isExtended() const { return ExtOp != Instruction::CastOps::CastOpsEnd; }
+  bool isExtended() const { return ExtOp0 != Instruction::CastOps::CastOpsEnd; }
 
   /// Return if the operands of mul instruction come from same extend.
-  bool isSameExtend() const { return getVecOp0() == getVecOp1(); }
+  bool isSameExtendVal() const { return getVecOp0() == getVecOp1(); }
 
-  /// Return the opcode of the underlying extend.
-  Instruction::CastOps getExtOpcode() const { return ExtOp; }
+  /// Return the opcode of the underlying extends.
+  Instruction::CastOps getExt0Opcode() const { return ExtOp0; }
+  Instruction::CastOps getExt1Opcode() const { return ExtOp1; }
+
+  /// Return if the first extend's opcode is ZExt.
+  bool isZExt0() const { return ExtOp0 == Instruction::CastOps::ZExt; }
+
+  /// Return if the second extend's opcode is ZExt.
+  bool isZExt1() const { return ExtOp1 == Instruction::CastOps::ZExt; }
 
-  /// Return if the extend opcode is ZExt.
-  bool isZExt() const { return ExtOp == Instruction::CastOps::ZExt; }
+  /// Return the non negative flag of the first ext recipe.
+  bool isNonNeg0() const { return IsNonNeg0; }
 
-  /// Return the non negative flag of the ext recipe.
-  bool isNonNeg() const { return IsNonNeg; }
+  /// Return the non negative flag of the second ext recipe.
+  bool isNonNeg1() const { return IsNonNeg1; }
 
   /// Return if the underlying reduction recipe is a partial reduction.
   bool isPartialReduction() const { return IsPartialReduction; }

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 19 additions & 8 deletions
@@ -2438,14 +2438,14 @@ VPMulAccumulateReductionRecipe::computeCost(ElementCount VF,
     return Ctx.TTI.getPartialReductionCost(
         Instruction::Add, Ctx.Types.inferScalarType(getVecOp0()),
         Ctx.Types.inferScalarType(getVecOp1()), getResultType(), VF,
-        TTI::getPartialReductionExtendKind(getExtOpcode()),
-        TTI::getPartialReductionExtendKind(getExtOpcode()), Instruction::Mul);
+        TTI::getPartialReductionExtendKind(getExt0Opcode()),
+        TTI::getPartialReductionExtendKind(getExt1Opcode()), Instruction::Mul);
   }
 
   Type *RedTy = Ctx.Types.inferScalarType(this);
   auto *SrcVecTy =
       cast<VectorType>(toVectorTy(Ctx.Types.inferScalarType(getVecOp0()), VF));
-  return Ctx.TTI.getMulAccReductionCost(isZExt(), RedTy, SrcVecTy,
+  return Ctx.TTI.getMulAccReductionCost(isZExt0(), RedTy, SrcVecTy,
                                         Ctx.CostKind);
 }
 
@@ -2530,13 +2530,24 @@ void VPMulAccumulateReductionRecipe::print(raw_ostream &O, const Twine &Indent,
   if (isExtended())
     O << "(";
   getVecOp0()->printAsOperand(O, SlotTracker);
-  if (isExtended())
-    O << " extended to " << *getResultType() << "), (";
-  else
+  if (isExtended()) {
+    O << " ";
+    if (isZExt0())
+      O << "zero-";
+    else
+      O << "sign-";
+    O << "extended to " << *getResultType() << "), (";
+  } else
     O << ", ";
   getVecOp1()->printAsOperand(O, SlotTracker);
-  if (isExtended())
-    O << " extended to " << *getResultType() << ")";
+  if (isExtended()) {
+    O << " ";
+    if (isZExt1())
+      O << "zero-";
+    else
+      O << "sign-";
+    O << "extended to " << *getResultType() << ")";
+  }
   if (isConditional()) {
     O << ", ";
     getCondOp()->printAsOperand(O, SlotTracker);
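To show the effect of the print() change above, here is a minimal standalone sketch using simplified stand-in types rather than the real VPlan classes: each operand's annotation now reflects its own extend opcode instead of one shared opcode.

#include <iostream>
#include <string>

// Simplified stand-ins for Instruction::CastOps and the recipe state;
// the real classes live in VPlan.h.
enum class CastOps { ZExt, SExt, CastOpsEnd };

struct MulAccSketch {
  CastOps ExtOp0 = CastOps::CastOpsEnd;
  CastOps ExtOp1 = CastOps::CastOpsEnd;
  bool isExtended() const { return ExtOp0 != CastOps::CastOpsEnd; }
  bool isZExt0() const { return ExtOp0 == CastOps::ZExt; }
  bool isZExt1() const { return ExtOp1 == CastOps::ZExt; }
};

// Mirrors the per-operand "zero-"/"sign-" annotation added to print().
void printMul(const MulAccSketch &R, const std::string &Op0,
              const std::string &Op1) {
  if (!R.isExtended()) {
    std::cout << "mul " << Op0 << ", " << Op1 << "\n";
    return;
  }
  std::cout << "mul (" << Op0 << " " << (R.isZExt0() ? "zero-" : "sign-")
            << "extended), (" << Op1 << " "
            << (R.isZExt1() ? "zero-" : "sign-") << "extended)\n";
}

int main() {
  MulAccSketch R{CastOps::SExt, CastOps::ZExt};
  printMul(R, "ir<%load.a>", "ir<%load.b>");
  // prints: mul (ir<%load.a> sign-extended), (ir<%load.b> zero-extended)
}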

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 12 additions & 13 deletions
@@ -2121,26 +2121,27 @@ expandVPMulAccumulateReduction(VPMulAccumulateReductionRecipe *MulAcc) {
   VPValue *Op0, *Op1;
   if (MulAcc->isExtended()) {
     Type *RedTy = MulAcc->getResultType();
-    if (MulAcc->isZExt())
-      Op0 = new VPWidenCastRecipe(MulAcc->getExtOpcode(), MulAcc->getVecOp0(),
-                                  RedTy, MulAcc->isNonNeg(),
+    if (MulAcc->isZExt0())
+      Op0 = new VPWidenCastRecipe(MulAcc->getExt0Opcode(), MulAcc->getVecOp0(),
+                                  RedTy, MulAcc->isNonNeg0(),
                                   MulAcc->getDebugLoc());
     else
-      Op0 = new VPWidenCastRecipe(MulAcc->getExtOpcode(), MulAcc->getVecOp0(),
+      Op0 = new VPWidenCastRecipe(MulAcc->getExt0Opcode(), MulAcc->getVecOp0(),
                                   RedTy, MulAcc->getDebugLoc());
     Op0->getDefiningRecipe()->insertBefore(MulAcc);
     // Prevent reduce.add(mul(ext(A), ext(A))) generate duplicate
     // VPWidenCastRecipe.
     if (MulAcc->getVecOp0() == MulAcc->getVecOp1()) {
       Op1 = Op0;
     } else {
-      if (MulAcc->isZExt())
-        Op1 = new VPWidenCastRecipe(MulAcc->getExtOpcode(), MulAcc->getVecOp1(),
-                                    RedTy, MulAcc->isNonNeg(),
-                                    MulAcc->getDebugLoc());
+      if (MulAcc->isZExt1())
+        Op1 = new VPWidenCastRecipe(MulAcc->getExt1Opcode(),
+                                    MulAcc->getVecOp1(), RedTy,
+                                    MulAcc->isNonNeg1(), MulAcc->getDebugLoc());
       else
-        Op1 = new VPWidenCastRecipe(MulAcc->getExtOpcode(), MulAcc->getVecOp1(),
-                                    RedTy, MulAcc->getDebugLoc());
+        Op1 =
+            new VPWidenCastRecipe(MulAcc->getExt1Opcode(), MulAcc->getVecOp1(),
+                                  RedTy, MulAcc->getDebugLoc());
       Op1->getDefiningRecipe()->insertBefore(MulAcc);
     }
   } else {
@@ -2451,10 +2452,8 @@ tryToCreateAbstractPartialReductionRecipe(VPPartialReductionRecipe *PRed) {
 
   auto *Ext0 = dyn_cast<VPWidenCastRecipe>(BinOp->getOperand(0));
   auto *Ext1 = dyn_cast<VPWidenCastRecipe>(BinOp->getOperand(1));
-  // TODO: Make work with extends of different signedness
   if (!Ext0 || Ext0->hasMoreThanOneUniqueUser() || !Ext1 ||
-      Ext1->hasMoreThanOneUniqueUser() ||
-      Ext0->getOpcode() != Ext1->getOpcode())
+      Ext1->hasMoreThanOneUniqueUser())
     return;
 
   auto *AbstractR = new VPMulAccumulateReductionRecipe(PRed, BinOp, Ext0, Ext1,
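The second hunk is the functional core of the change: the TODO is resolved and the guard in tryToCreateAbstractPartialReductionRecipe no longer requires the two extends to share an opcode. A rough before/after model of that predicate, using hypothetical stand-in types rather than the real VPlan recipes:

#include <cassert>

// Hypothetical stand-in for a VPWidenCastRecipe's opcode/user state;
// the real check operates on VPlan recipes.
struct ExtSketch {
  int Opcode;          // e.g. 0 = SExt, 1 = ZExt
  int NumUniqueUsers;
  bool hasMoreThanOneUniqueUser() const { return NumUniqueUsers > 1; }
};

// Old guard: both extends had to exist, be single-use, and share one opcode.
bool canBundleOld(const ExtSketch *E0, const ExtSketch *E1) {
  return E0 && !E0->hasMoreThanOneUniqueUser() && E1 &&
         !E1->hasMoreThanOneUniqueUser() && E0->Opcode == E1->Opcode;
}

// New guard: the opcode-equality requirement is dropped, so a sext/zext
// mix may now be bundled into a VPMulAccumulateReductionRecipe.
bool canBundleNew(const ExtSketch *E0, const ExtSketch *E1) {
  return E0 && !E0->hasMoreThanOneUniqueUser() && E1 &&
         !E1->hasMoreThanOneUniqueUser();
}

int main() {
  ExtSketch SExtA{/*Opcode=*/0, /*NumUniqueUsers=*/1};
  ExtSketch ZExtB{/*Opcode=*/1, /*NumUniqueUsers=*/1};
  assert(!canBundleOld(&SExtA, &ZExtB)); // mixed signedness was rejected
  assert(canBundleNew(&SExtA, &ZExtB));  // now it bundles
}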

llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll

Lines changed: 28 additions & 28 deletions
@@ -22,19 +22,19 @@ define i32 @dotp_z_s(ptr %a, ptr %b) #0 {
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
 ; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
-; CHECK-NEXT:    [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[TMP6]], i32 16
 ; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
 ; CHECK-NEXT:    [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
-; CHECK-NEXT:    [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
-; CHECK-NEXT:    [[TMP10:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
-; CHECK-NEXT:    [[TMP11:%.*]] = mul <16 x i32> [[TMP9]], [[TMP4]]
+; CHECK-NEXT:    [[TMP10:%.*]] = sext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP12:%.*]] = mul <16 x i32> [[TMP10]], [[TMP5]]
-; CHECK-NEXT:    [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]])
-; CHECK-NEXT:    [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP12]])
+; CHECK-NEXT:    [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP12]])
+; CHECK-NEXT:    [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; CHECK-NEXT:    [[TMP11:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = mul <16 x i32> [[TMP15]], [[TMP11]]
+; CHECK-NEXT:    [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP16]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
 ; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -60,19 +60,19 @@ define i32 @dotp_z_s(ptr %a, ptr %b) #0 {
 ; CHECK-NOI8MM-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16
 ; CHECK-NOI8MM-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
 ; CHECK-NOI8MM-NEXT:    [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
-; CHECK-NOI8MM-NEXT:    [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-NOI8MM-NEXT:    [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
 ; CHECK-NOI8MM-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
 ; CHECK-NOI8MM-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
 ; CHECK-NOI8MM-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[TMP6]], i32 16
 ; CHECK-NOI8MM-NEXT:    [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
 ; CHECK-NOI8MM-NEXT:    [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
-; CHECK-NOI8MM-NEXT:    [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
-; CHECK-NOI8MM-NEXT:    [[TMP10:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
-; CHECK-NOI8MM-NEXT:    [[TMP11:%.*]] = mul <16 x i32> [[TMP9]], [[TMP4]]
+; CHECK-NOI8MM-NEXT:    [[TMP10:%.*]] = sext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
+; CHECK-NOI8MM-NEXT:    [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
 ; CHECK-NOI8MM-NEXT:    [[TMP12:%.*]] = mul <16 x i32> [[TMP10]], [[TMP5]]
-; CHECK-NOI8MM-NEXT:    [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]])
-; CHECK-NOI8MM-NEXT:    [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP12]])
+; CHECK-NOI8MM-NEXT:    [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP12]])
+; CHECK-NOI8MM-NEXT:    [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; CHECK-NOI8MM-NEXT:    [[TMP11:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-NOI8MM-NEXT:    [[TMP16:%.*]] = mul <16 x i32> [[TMP15]], [[TMP11]]
+; CHECK-NOI8MM-NEXT:    [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP16]])
 ; CHECK-NOI8MM-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
 ; CHECK-NOI8MM-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; CHECK-NOI8MM-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -121,19 +121,19 @@ define i32 @dotp_s_z(ptr %a, ptr %b) #0 {
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
 ; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
-; CHECK-NEXT:    [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[TMP6]], i32 16
 ; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
 ; CHECK-NEXT:    [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
-; CHECK-NEXT:    [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
-; CHECK-NEXT:    [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
-; CHECK-NEXT:    [[TMP11:%.*]] = mul <16 x i32> [[TMP9]], [[TMP4]]
+; CHECK-NEXT:    [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP12:%.*]] = mul <16 x i32> [[TMP10]], [[TMP5]]
-; CHECK-NEXT:    [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]])
-; CHECK-NEXT:    [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP12]])
+; CHECK-NEXT:    [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP12]])
+; CHECK-NEXT:    [[TMP15:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; CHECK-NEXT:    [[TMP11:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = mul <16 x i32> [[TMP15]], [[TMP11]]
+; CHECK-NEXT:    [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP16]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
 ; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -159,19 +159,19 @@ define i32 @dotp_s_z(ptr %a, ptr %b) #0 {
 ; CHECK-NOI8MM-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16
 ; CHECK-NOI8MM-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
 ; CHECK-NOI8MM-NEXT:    [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
-; CHECK-NOI8MM-NEXT:    [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-NOI8MM-NEXT:    [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
 ; CHECK-NOI8MM-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
 ; CHECK-NOI8MM-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
 ; CHECK-NOI8MM-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[TMP6]], i32 16
 ; CHECK-NOI8MM-NEXT:    [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
 ; CHECK-NOI8MM-NEXT:    [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
-; CHECK-NOI8MM-NEXT:    [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
-; CHECK-NOI8MM-NEXT:    [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
-; CHECK-NOI8MM-NEXT:    [[TMP11:%.*]] = mul <16 x i32> [[TMP9]], [[TMP4]]
+; CHECK-NOI8MM-NEXT:    [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
+; CHECK-NOI8MM-NEXT:    [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
 ; CHECK-NOI8MM-NEXT:    [[TMP12:%.*]] = mul <16 x i32> [[TMP10]], [[TMP5]]
-; CHECK-NOI8MM-NEXT:    [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]])
-; CHECK-NOI8MM-NEXT:    [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP12]])
+; CHECK-NOI8MM-NEXT:    [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP12]])
+; CHECK-NOI8MM-NEXT:    [[TMP15:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; CHECK-NOI8MM-NEXT:    [[TMP11:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-NOI8MM-NEXT:    [[TMP16:%.*]] = mul <16 x i32> [[TMP15]], [[TMP11]]
+; CHECK-NOI8MM-NEXT:    [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP16]])
 ; CHECK-NOI8MM-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
 ; CHECK-NOI8MM-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; CHECK-NOI8MM-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
