Commit dd60663

[SLP] Emit reduction instead of 2 extracts + scalar op, when vectorizing operands (#147583)
Emit a 2-element reduction instead of 2 extracts + a scalar op when trying to vectorize the operands of an instruction, if the reduction is more profitable.
1 parent c452de1 commit dd60663
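
For illustration, a minimal before/after sketch in LLVM IR of the rewrite this change enables (value names are illustrative; the shape mirrors the reduce-fadd.ll and commute.ll test updates below):

Before: two extracts feeding a scalar op
  %elt0 = extractelement <2 x float> %v, i64 0
  %elt1 = extractelement <2 x float> %v, i64 1
  %add  = fadd fast float %elt1, %elt0

After: a single 2-element reduction, emitted only when the TTI cost model reports it as cheaper
  %add = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> %v)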

File tree

7 files changed

+170 -81 lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 82 additions & 7 deletions
@@ -21722,6 +21722,8 @@ class HorizontalReduction {
   /// Checks if the optimization of original scalar identity operations on
   /// matched horizontal reductions is enabled and allowed.
   bool IsSupportedHorRdxIdentityOp = false;
+  /// The minimum number of the reduced values.
+  const unsigned ReductionLimit = VectorizeNonPowerOf2 ? 3 : 4;
   /// Contains vector values for reduction including their scale factor and
   /// signedness.
   SmallVector<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales;
@@ -21740,7 +21742,8 @@ class HorizontalReduction {
   }

   /// Checks if instruction is associative and can be vectorized.
-  static bool isVectorizable(RecurKind Kind, Instruction *I) {
+  static bool isVectorizable(RecurKind Kind, Instruction *I,
+                             bool TwoElementReduction = false) {
     if (Kind == RecurKind::None)
       return false;

@@ -21749,6 +21752,10 @@ class HorizontalReduction {
         isBoolLogicOp(I))
      return true;

+    // No need to check for associativity, if 2 reduced values.
+    if (TwoElementReduction)
+      return true;
+
    if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
      // FP min/max are associative except for NaN and -0.0. We do not
      // have to rule out -0.0 here because the intrinsic semantics do not
@@ -22020,6 +22027,27 @@ class HorizontalReduction {

 public:
   HorizontalReduction() = default;
+  HorizontalReduction(Instruction *I, ArrayRef<Value *> Ops)
+      : ReductionRoot(I), ReductionLimit(2) {
+    RdxKind = HorizontalReduction::getRdxKind(I);
+    ReductionOps.emplace_back().push_back(I);
+    ReducedVals.emplace_back().assign(Ops.begin(), Ops.end());
+    for (Value *V : Ops)
+      ReducedValsToOps[V].push_back(I);
+  }
+
+  bool matchReductionForOperands() const {
+    // Analyze "regular" integer/FP types for reductions - no target-specific
+    // types or pointers.
+    assert(ReductionRoot && "Reduction root is not set!");
+    if (!isVectorizable(RdxKind, cast<Instruction>(ReductionRoot),
+                        all_of(ReducedVals, [](ArrayRef<Value *> Ops) {
+                          return Ops.size() == 2;
+                        })))
+      return false;
+
+    return true;
+  }

   /// Try to find a reduction tree.
   bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
@@ -22187,7 +22215,6 @@ class HorizontalReduction {
   /// Attempt to vectorize the tree found by matchAssociativeReduction.
   Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
                      const TargetLibraryInfo &TLI, AssumptionCache *AC) {
-    const unsigned ReductionLimit = VectorizeNonPowerOf2 ? 3 : 4;
     constexpr unsigned RegMaxNumber = 4;
     constexpr unsigned RedValsMaxNumber = 128;
     // If there are a sufficient number of reduction values, reduce
@@ -22521,8 +22548,10 @@ class HorizontalReduction {
        continue;
      }
      V.reorderTopToBottom();
-      // No need to reorder the root node at all.
-      V.reorderBottomToTop(/*IgnoreReorder=*/true);
+      // No need to reorder the root node at all for reassociative reduction.
+      V.reorderBottomToTop(/*IgnoreReorder=*/RdxFMF.allowReassoc() ||
+                               VL.front()->getType()->isIntOrIntVectorTy() ||
+                               ReductionLimit > 2);
      // Keep extracted other reduction values, if they are used in the
      // vectorization trees.
      BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
@@ -23736,15 +23765,61 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
      Candidates.emplace_back(A1, B);
  }

+  auto TryToReduce = [this, &R, &TTI = *TTI](Instruction *Inst,
+                                             ArrayRef<Value *> Ops) {
+    if (!isReductionCandidate(Inst))
+      return false;
+    Type *Ty = Inst->getType();
+    if (!isValidElementType(Ty) || Ty->isPointerTy())
+      return false;
+    HorizontalReduction HorRdx(Inst, Ops);
+    if (!HorRdx.matchReductionForOperands())
+      return false;
+    // Check the cost of operations.
+    VectorType *VecTy = getWidenedType(Ty, Ops.size());
+    constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+    InstructionCost ScalarCost =
+        TTI.getScalarizationOverhead(
+            VecTy, APInt::getAllOnes(getNumElements(VecTy)), /*Insert=*/false,
+            /*Extract=*/true, CostKind) +
+        TTI.getInstructionCost(Inst, CostKind);
+    InstructionCost RedCost;
+    switch (::getRdxKind(Inst)) {
+    case RecurKind::Add:
+    case RecurKind::Mul:
+    case RecurKind::Or:
+    case RecurKind::And:
+    case RecurKind::Xor:
+    case RecurKind::FAdd:
+    case RecurKind::FMul: {
+      FastMathFlags FMF;
+      if (auto *FPCI = dyn_cast<FPMathOperator>(Inst))
+        FMF = FPCI->getFastMathFlags();
+      RedCost = TTI.getArithmeticReductionCost(Inst->getOpcode(), VecTy, FMF,
+                                               CostKind);
+      break;
+    }
+    default:
+      return false;
+    }
+    if (RedCost >= ScalarCost)
+      return false;
+
+    return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC) != nullptr;
+  };
   if (Candidates.size() == 1)
-    return tryToVectorizeList({Op0, Op1}, R);
+    return TryToReduce(I, {Op0, Op1}) || tryToVectorizeList({Op0, Op1}, R);

   // We have multiple options. Try to pick the single best.
   std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
   if (!BestCandidate)
     return false;
-  return tryToVectorizeList(
-      {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second}, R);
+  return (*BestCandidate == 0 &&
+          TryToReduce(I, {Candidates[*BestCandidate].first,
+                          Candidates[*BestCandidate].second})) ||
+         tryToVectorizeList({Candidates[*BestCandidate].first,
+                             Candidates[*BestCandidate].second},
+                            R);
 }

 bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
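
As a condensed sketch of what the new TryToReduce path produces for a 3-element integer dot product (value names illustrative, following the vec3-base.ll POW2-ONLY checks below): the first two lanes become a vector multiply plus a 2-element reduction, while the odd third lane stays scalar.

  %mul01 = mul nsw <2 x i32> %a01, %b01                             ; lanes 0-1, vectorized
  %red   = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %mul01)
  %mul2  = mul nsw i32 %a2, %b2                                     ; lane 2 stays scalar
  %dot   = add i32 %red, %mul2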

llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll

Lines changed: 2 additions & 6 deletions
@@ -16,9 +16,7 @@ define void @test1(ptr nocapture readonly %J, i32 %xmin, i32 %ymin) {
 ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[J:%.*]], align 4
 ; CHECK-NEXT: [[TMP5:%.*]] = fsub fast <2 x float> [[TMP2]], [[TMP4]]
 ; CHECK-NEXT: [[TMP6:%.*]] = fmul fast <2 x float> [[TMP5]], [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP6]], i32 1
-; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP7]], [[TMP8]]
+; CHECK-NEXT: [[ADD:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP6]])
 ; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[ADD]], 0.000000e+00
 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY3_LR_PH]], label [[FOR_END27:%.*]]
 ; CHECK: for.end27:
@@ -57,9 +55,7 @@ define void @test2(ptr nocapture readonly %J, i32 %xmin, i32 %ymin) {
 ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[J:%.*]], align 4
 ; CHECK-NEXT: [[TMP5:%.*]] = fsub fast <2 x float> [[TMP2]], [[TMP4]]
 ; CHECK-NEXT: [[TMP6:%.*]] = fmul fast <2 x float> [[TMP5]], [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP6]], i32 1
-; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP8]], [[TMP7]]
+; CHECK-NEXT: [[ADD:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP6]])
 ; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[ADD]], 0.000000e+00
 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY3_LR_PH]], label [[FOR_END27:%.*]]
 ; CHECK: for.end27:

llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll

Lines changed: 16 additions & 14 deletions
@@ -3,13 +3,19 @@
 ; RUN: opt < %s -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK,FULLFP16

 define half @reduce_fast_half2(<2 x half> %vec2) {
-; CHECK-LABEL: define half @reduce_fast_half2(
-; CHECK-SAME: <2 x half> [[VEC2:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[ELT0:%.*]] = extractelement <2 x half> [[VEC2]], i64 0
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x half> [[VEC2]], i64 1
-; CHECK-NEXT: [[ADD1:%.*]] = fadd fast half [[ELT1]], [[ELT0]]
-; CHECK-NEXT: ret half [[ADD1]]
+; NOFP16-LABEL: define half @reduce_fast_half2(
+; NOFP16-SAME: <2 x half> [[VEC2:%.*]]) #[[ATTR0:[0-9]+]] {
+; NOFP16-NEXT: [[ENTRY:.*:]]
+; NOFP16-NEXT: [[ELT0:%.*]] = extractelement <2 x half> [[VEC2]], i64 0
+; NOFP16-NEXT: [[ELT1:%.*]] = extractelement <2 x half> [[VEC2]], i64 1
+; NOFP16-NEXT: [[ADD1:%.*]] = fadd fast half [[ELT1]], [[ELT0]]
+; NOFP16-NEXT: ret half [[ADD1]]
+;
+; FULLFP16-LABEL: define half @reduce_fast_half2(
+; FULLFP16-SAME: <2 x half> [[VEC2:%.*]]) #[[ATTR0:[0-9]+]] {
+; FULLFP16-NEXT: [[ENTRY:.*:]]
+; FULLFP16-NEXT: [[TMP0:%.*]] = call fast half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> [[VEC2]])
+; FULLFP16-NEXT: ret half [[TMP0]]
 ;
 entry:
   %elt0 = extractelement <2 x half> %vec2, i64 0
@@ -20,7 +26,7 @@ entry:

 define half @reduce_half2(<2 x half> %vec2) {
 ; CHECK-LABEL: define half @reduce_half2(
-; CHECK-SAME: <2 x half> [[VEC2:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <2 x half> [[VEC2:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT: [[ENTRY:.*:]]
 ; CHECK-NEXT: [[ELT0:%.*]] = extractelement <2 x half> [[VEC2]], i64 0
 ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x half> [[VEC2]], i64 1
@@ -269,9 +275,7 @@ define float @reduce_fast_float2(<2 x float> %vec2) {
 ; CHECK-LABEL: define float @reduce_fast_float2(
 ; CHECK-SAME: <2 x float> [[VEC2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[ELT0:%.*]] = extractelement <2 x float> [[VEC2]], i64 0
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[VEC2]], i64 1
-; CHECK-NEXT: [[ADD1:%.*]] = fadd fast float [[ELT1]], [[ELT0]]
+; CHECK-NEXT: [[ADD1:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[VEC2]])
 ; CHECK-NEXT: ret float [[ADD1]]
 ;
 entry:
@@ -409,9 +413,7 @@ define double @reduce_fast_double2(<2 x double> %vec2) {
 ; CHECK-LABEL: define double @reduce_fast_double2(
 ; CHECK-SAME: <2 x double> [[VEC2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[ELT0:%.*]] = extractelement <2 x double> [[VEC2]], i64 0
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x double> [[VEC2]], i64 1
-; CHECK-NEXT: [[ADD1:%.*]] = fadd fast double [[ELT1]], [[ELT0]]
+; CHECK-NEXT: [[ADD1:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> [[VEC2]])
 ; CHECK-NEXT: ret double [[ADD1]]
 ;
 entry:

llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll

Lines changed: 4 additions & 4 deletions
@@ -156,10 +156,10 @@ define ptr @test4() {
 ; POWEROF2-NEXT: [[TMP13:%.*]] = fmul <2 x float> [[TMP12]], zeroinitializer
 ; POWEROF2-NEXT: [[TMP14:%.*]] = call <2 x float> @llvm.vector.extract.v2f32.v4f32(<4 x float> [[TMP10]], i64 2)
 ; POWEROF2-NEXT: [[TMP15:%.*]] = fmul <2 x float> zeroinitializer, [[TMP14]]
-; POWEROF2-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP9]], i32 0
-; POWEROF2-NEXT: [[TMP17:%.*]] = fmul float 0.000000e+00, [[TMP18]]
-; POWEROF2-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[TMP9]], i32 1
-; POWEROF2-NEXT: [[TMP19:%.*]] = fmul float [[TMP30]], 0.000000e+00
+; POWEROF2-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP9]], i32 0
+; POWEROF2-NEXT: [[TMP17:%.*]] = fmul float 0.000000e+00, [[TMP16]]
+; POWEROF2-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP9]], i32 1
+; POWEROF2-NEXT: [[TMP19:%.*]] = fmul float [[TMP18]], 0.000000e+00
 ; POWEROF2-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[TMP13]], i32 0
 ; POWEROF2-NEXT: [[TMP21:%.*]] = fadd reassoc nsz float [[TMP20]], [[TMP17]]
 ; POWEROF2-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[TMP15]], i32 0

llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll

Lines changed: 11 additions & 27 deletions
@@ -513,21 +513,16 @@ define i32 @dot_product_i32(ptr %a, ptr %b) {
 ;
 ; POW2-ONLY-LABEL: @dot_product_i32(
 ; POW2-ONLY-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 0
-; POW2-ONLY-NEXT: [[L_A_0:%.*]] = load i32, ptr [[GEP_A_0]], align 4
-; POW2-ONLY-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 1
-; POW2-ONLY-NEXT: [[L_A_1:%.*]] = load i32, ptr [[GEP_A_1]], align 4
 ; POW2-ONLY-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 2
 ; POW2-ONLY-NEXT: [[L_A_2:%.*]] = load i32, ptr [[GEP_A_2]], align 4
 ; POW2-ONLY-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 0
-; POW2-ONLY-NEXT: [[L_B_0:%.*]] = load i32, ptr [[GEP_B_0]], align 4
-; POW2-ONLY-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 1
-; POW2-ONLY-NEXT: [[L_B_1:%.*]] = load i32, ptr [[GEP_B_1]], align 4
 ; POW2-ONLY-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 2
 ; POW2-ONLY-NEXT: [[L_B_2:%.*]] = load i32, ptr [[GEP_B_2]], align 4
-; POW2-ONLY-NEXT: [[MUL_0:%.*]] = mul nsw i32 [[L_A_0]], [[L_B_0]]
-; POW2-ONLY-NEXT: [[MUL_1:%.*]] = mul nsw i32 [[L_A_1]], [[L_B_1]]
+; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[GEP_A_0]], align 4
+; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[GEP_B_0]], align 4
+; POW2-ONLY-NEXT: [[TMP3:%.*]] = mul nsw <2 x i32> [[TMP1]], [[TMP2]]
 ; POW2-ONLY-NEXT: [[MUL_2:%.*]] = mul nsw i32 [[L_A_2]], [[L_B_2]]
-; POW2-ONLY-NEXT: [[ADD_0:%.*]] = add i32 [[MUL_0]], [[MUL_1]]
+; POW2-ONLY-NEXT: [[ADD_0:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP3]])
 ; POW2-ONLY-NEXT: [[ADD_1:%.*]] = add i32 [[ADD_0]], [[MUL_2]]
 ; POW2-ONLY-NEXT: ret i32 [[ADD_1]]
 ;
@@ -568,21 +563,16 @@ define i32 @dot_product_i32_reorder(ptr %a, ptr %b) {
 ;
 ; POW2-ONLY-LABEL: @dot_product_i32_reorder(
 ; POW2-ONLY-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 0
-; POW2-ONLY-NEXT: [[L_A_0:%.*]] = load i32, ptr [[GEP_A_0]], align 4
-; POW2-ONLY-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 1
-; POW2-ONLY-NEXT: [[L_A_1:%.*]] = load i32, ptr [[GEP_A_1]], align 4
 ; POW2-ONLY-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 2
 ; POW2-ONLY-NEXT: [[L_A_2:%.*]] = load i32, ptr [[GEP_A_2]], align 4
 ; POW2-ONLY-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 0
-; POW2-ONLY-NEXT: [[L_B_0:%.*]] = load i32, ptr [[GEP_B_0]], align 4
-; POW2-ONLY-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 1
-; POW2-ONLY-NEXT: [[L_B_1:%.*]] = load i32, ptr [[GEP_B_1]], align 4
 ; POW2-ONLY-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 2
 ; POW2-ONLY-NEXT: [[L_B_2:%.*]] = load i32, ptr [[GEP_B_2]], align 4
-; POW2-ONLY-NEXT: [[MUL_0:%.*]] = mul nsw i32 [[L_A_0]], [[L_B_0]]
-; POW2-ONLY-NEXT: [[MUL_1:%.*]] = mul nsw i32 [[L_A_1]], [[L_B_1]]
+; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[GEP_A_0]], align 4
+; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[GEP_B_0]], align 4
+; POW2-ONLY-NEXT: [[TMP3:%.*]] = mul nsw <2 x i32> [[TMP1]], [[TMP2]]
 ; POW2-ONLY-NEXT: [[MUL_2:%.*]] = mul nsw i32 [[L_A_2]], [[L_B_2]]
-; POW2-ONLY-NEXT: [[ADD_0:%.*]] = add i32 [[MUL_1]], [[MUL_0]]
+; POW2-ONLY-NEXT: [[ADD_0:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP3]])
 ; POW2-ONLY-NEXT: [[ADD_1:%.*]] = add i32 [[ADD_0]], [[MUL_2]]
 ; POW2-ONLY-NEXT: ret i32 [[ADD_1]]
 ;
@@ -630,9 +620,7 @@ define float @dot_product_fp32(ptr %a, ptr %b) {
 ; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[GEP_B_0]], align 4
 ; POW2-ONLY-NEXT: [[TMP3:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP2]]
 ; POW2-ONLY-NEXT: [[MUL_2:%.*]] = fmul fast float [[L_A_2]], [[L_B_2]]
-; POW2-ONLY-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
-; POW2-ONLY-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
-; POW2-ONLY-NEXT: [[ADD_0:%.*]] = fadd fast float [[TMP4]], [[TMP5]]
+; POW2-ONLY-NEXT: [[ADD_0:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP3]])
 ; POW2-ONLY-NEXT: [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]]
 ; POW2-ONLY-NEXT: ret float [[ADD_1]]
 ;
@@ -682,9 +670,7 @@ define float @dot_product_fp32_reorder(ptr %a, ptr %b) {
 ; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[GEP_B_0]], align 4
 ; POW2-ONLY-NEXT: [[TMP3:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP2]]
 ; POW2-ONLY-NEXT: [[MUL_2:%.*]] = fmul fast float [[L_A_2]], [[L_B_2]]
-; POW2-ONLY-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
-; POW2-ONLY-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
-; POW2-ONLY-NEXT: [[ADD_0:%.*]] = fadd fast float [[TMP5]], [[TMP4]]
+; POW2-ONLY-NEXT: [[ADD_0:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP3]])
 ; POW2-ONLY-NEXT: [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]]
 ; POW2-ONLY-NEXT: ret float [[ADD_1]]
 ;
@@ -733,9 +719,7 @@ define double @dot_product_fp64(ptr %a, ptr %b) {
 ; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[GEP_B_0]], align 4
 ; POW2-ONLY-NEXT: [[TMP3:%.*]] = fmul fast <2 x double> [[TMP1]], [[TMP2]]
 ; POW2-ONLY-NEXT: [[MUL_2:%.*]] = fmul fast double [[L_A_2]], [[L_B_2]]
-; POW2-ONLY-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
-; POW2-ONLY-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
-; POW2-ONLY-NEXT: [[ADD_0:%.*]] = fadd fast double [[TMP4]], [[TMP5]]
+; POW2-ONLY-NEXT: [[ADD_0:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> [[TMP3]])
 ; POW2-ONLY-NEXT: [[ADD_1:%.*]] = fadd fast double [[ADD_0]], [[MUL_2]]
 ; POW2-ONLY-NEXT: ret double [[ADD_1]]
 ;
