-
Notifications
You must be signed in to change notification settings - Fork 14.4k
[SLP] Emit reduction instead of 2 extracts + scalar op, when vectorizing operands #147583
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
Created using spr 1.3.5
@llvm/pr-subscribers-vectorizers @llvm/pr-subscribers-llvm-transforms Author: Alexey Bataev (alexey-bataev) Changes: Added emission of the 2-element reduction instead of 2 extracts + scalar op, when trying to vectorize operands of the instruction, if it is more profitable. Patch is 36.41 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/147583.diff 8 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index c93af749507f8..bec393051a257 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -21676,58 +21676,6 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
return Changed;
}
-bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
- if (!I)
- return false;
-
- if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
- return false;
-
- Value *P = I->getParent();
-
- // Vectorize in current basic block only.
- auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
- auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
- if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
- R.isDeleted(Op0) || R.isDeleted(Op1))
- return false;
-
- // First collect all possible candidates
- SmallVector<std::pair<Value *, Value *>, 4> Candidates;
- Candidates.emplace_back(Op0, Op1);
-
- auto *A = dyn_cast<BinaryOperator>(Op0);
- auto *B = dyn_cast<BinaryOperator>(Op1);
- // Try to skip B.
- if (A && B && B->hasOneUse()) {
- auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
- auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
- if (B0 && B0->getParent() == P && !R.isDeleted(B0))
- Candidates.emplace_back(A, B0);
- if (B1 && B1->getParent() == P && !R.isDeleted(B1))
- Candidates.emplace_back(A, B1);
- }
- // Try to skip A.
- if (B && A && A->hasOneUse()) {
- auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
- auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
- if (A0 && A0->getParent() == P && !R.isDeleted(A0))
- Candidates.emplace_back(A0, B);
- if (A1 && A1->getParent() == P && !R.isDeleted(A1))
- Candidates.emplace_back(A1, B);
- }
-
- if (Candidates.size() == 1)
- return tryToVectorizeList({Op0, Op1}, R);
-
- // We have multiple options. Try to pick the single best.
- std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
- if (!BestCandidate)
- return false;
- return tryToVectorizeList(
- {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second}, R);
-}
-
namespace {
/// Model horizontal reductions.
@@ -21770,6 +21718,8 @@ class HorizontalReduction {
/// Checks if the optimization of original scalar identity operations on
/// matched horizontal reductions is enabled and allowed.
bool IsSupportedHorRdxIdentityOp = false;
+ /// The minimum number of the reduced values.
+ const unsigned ReductionLimit = VectorizeNonPowerOf2 ? 3 : 4;
/// Contains vector values for reduction including their scale factor and
/// signedness.
SmallVector<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales;
@@ -22068,6 +22018,24 @@ class HorizontalReduction {
public:
HorizontalReduction() = default;
+ HorizontalReduction(Instruction *I, ArrayRef<Value *> Ops)
+ : ReductionRoot(I), ReductionLimit(2) {
+ RdxKind = HorizontalReduction::getRdxKind(I);
+ ReductionOps.emplace_back().push_back(I);
+ ReducedVals.emplace_back().assign(Ops.begin(), Ops.end());
+ for (Value *V : Ops)
+ ReducedValsToOps[V].push_back(I);
+ }
+
+ bool matchReductionForOperands() const {
+ // Analyze "regular" integer/FP types for reductions - no target-specific
+ // types or pointers.
+ assert(ReductionRoot && "Reduction root is not set!");
+ if (!isVectorizable(RdxKind, cast<Instruction>(ReductionRoot)))
+ return false;
+
+ return true;
+ }
/// Try to find a reduction tree.
bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
@@ -22235,7 +22203,6 @@ class HorizontalReduction {
/// Attempt to vectorize the tree found by matchAssociativeReduction.
Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
const TargetLibraryInfo &TLI, AssumptionCache *AC) {
- const unsigned ReductionLimit = VectorizeNonPowerOf2 ? 3 : 4;
constexpr unsigned RegMaxNumber = 4;
constexpr unsigned RedValsMaxNumber = 128;
// If there are a sufficient number of reduction values, reduce
@@ -23740,6 +23707,102 @@ bool SLPVectorizerPass::vectorizeHorReduction(
return Res;
}
+bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
+ if (!I)
+ return false;
+
+ if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
+ return false;
+
+ Value *P = I->getParent();
+
+ // Vectorize in current basic block only.
+ auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
+ auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
+ if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
+ R.isDeleted(Op0) || R.isDeleted(Op1))
+ return false;
+
+ // First collect all possible candidates
+ SmallVector<std::pair<Value *, Value *>, 4> Candidates;
+ Candidates.emplace_back(Op0, Op1);
+
+ auto *A = dyn_cast<BinaryOperator>(Op0);
+ auto *B = dyn_cast<BinaryOperator>(Op1);
+ // Try to skip B.
+ if (A && B && B->hasOneUse()) {
+ auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
+ auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
+ if (B0 && B0->getParent() == P && !R.isDeleted(B0))
+ Candidates.emplace_back(A, B0);
+ if (B1 && B1->getParent() == P && !R.isDeleted(B1))
+ Candidates.emplace_back(A, B1);
+ }
+ // Try to skip A.
+ if (B && A && A->hasOneUse()) {
+ auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
+ auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
+ if (A0 && A0->getParent() == P && !R.isDeleted(A0))
+ Candidates.emplace_back(A0, B);
+ if (A1 && A1->getParent() == P && !R.isDeleted(A1))
+ Candidates.emplace_back(A1, B);
+ }
+
+ auto TryToReduce = [this, &R, &TTI=*TTI](Instruction *Inst, ArrayRef<Value *> Ops) {
+ if (!isReductionCandidate(Inst))
+ return false;
+ Type *Ty = Inst->getType();
+ if (!isValidElementType(Ty) || Ty->isPointerTy())
+ return false;
+ HorizontalReduction HorRdx(Inst, Ops);
+ if (!HorRdx.matchReductionForOperands())
+ return false;
+ // Check the cost of operations.
+ VectorType *VecTy= getWidenedType(Ty, Ops.size());
+ constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ InstructionCost ScalarCost =
+ TTI.getScalarizationOverhead(
+ VecTy, APInt::getAllOnes(getNumElements(VecTy)), /*Insert=*/false,
+ /*Extract=*/true, CostKind) +
+ TTI.getInstructionCost(Inst, CostKind);
+ InstructionCost RedCost;
+ switch (::getRdxKind(Inst)) {
+ case RecurKind::Add:
+ case RecurKind::Mul:
+ case RecurKind::Or:
+ case RecurKind::And:
+ case RecurKind::Xor:
+ case RecurKind::FAdd:
+ case RecurKind::FMul: {
+ FastMathFlags FMF;
+ if (auto *FPCI = dyn_cast<FPMathOperator>(Inst))
+ FMF = FPCI->getFastMathFlags();
+ RedCost = TTI.getArithmeticReductionCost(Inst->getOpcode(), VecTy, FMF,
+ CostKind);
+ break;
+ }
+ default:
+ return false;
+ }
+ if (RedCost >= ScalarCost)
+ return false;
+
+ return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC) != nullptr;
+ };
+ if (Candidates.size() == 1)
+ return TryToReduce(I, {Op0, Op1}) || tryToVectorizeList({Op0, Op1}, R);
+
+ // We have multiple options. Try to pick the single best.
+ std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
+ if (!BestCandidate)
+ return false;
+ return TryToReduce(I, {Candidates[*BestCandidate].first,
+ Candidates[*BestCandidate].second}) ||
+ tryToVectorizeList({Candidates[*BestCandidate].first,
+ Candidates[*BestCandidate].second},
+ R);
+}
+
bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
BasicBlock *BB, BoUpSLP &R) {
SmallVector<WeakTrackingVH> PostponedInsts;
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll
index 19b6d82818532..442769937ac12 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll
@@ -16,9 +16,7 @@ define void @test1(ptr nocapture readonly %J, i32 %xmin, i32 %ymin) {
; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[J:%.*]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = fsub fast <2 x float> [[TMP2]], [[TMP4]]
; CHECK-NEXT: [[TMP6:%.*]] = fmul fast <2 x float> [[TMP5]], [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP6]], i32 1
-; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP7]], [[TMP8]]
+; CHECK-NEXT: [[ADD:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP6]])
; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[ADD]], 0.000000e+00
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY3_LR_PH]], label [[FOR_END27:%.*]]
; CHECK: for.end27:
@@ -57,9 +55,7 @@ define void @test2(ptr nocapture readonly %J, i32 %xmin, i32 %ymin) {
; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[J:%.*]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = fsub fast <2 x float> [[TMP2]], [[TMP4]]
; CHECK-NEXT: [[TMP6:%.*]] = fmul fast <2 x float> [[TMP5]], [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP6]], i32 1
-; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP8]], [[TMP7]]
+; CHECK-NEXT: [[ADD:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP6]])
; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[ADD]], 0.000000e+00
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY3_LR_PH]], label [[FOR_END27:%.*]]
; CHECK: for.end27:
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll
index 677d52bf3b4c3..8d4a1152fe4da 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll
@@ -3,13 +3,19 @@
; RUN: opt < %s -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK,FULLFP16
define half @reduce_fast_half2(<2 x half> %vec2) {
-; CHECK-LABEL: define half @reduce_fast_half2(
-; CHECK-SAME: <2 x half> [[VEC2:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[ELT0:%.*]] = extractelement <2 x half> [[VEC2]], i64 0
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x half> [[VEC2]], i64 1
-; CHECK-NEXT: [[ADD1:%.*]] = fadd fast half [[ELT1]], [[ELT0]]
-; CHECK-NEXT: ret half [[ADD1]]
+; NOFP16-LABEL: define half @reduce_fast_half2(
+; NOFP16-SAME: <2 x half> [[VEC2:%.*]]) #[[ATTR0:[0-9]+]] {
+; NOFP16-NEXT: [[ENTRY:.*:]]
+; NOFP16-NEXT: [[ELT0:%.*]] = extractelement <2 x half> [[VEC2]], i64 0
+; NOFP16-NEXT: [[ELT1:%.*]] = extractelement <2 x half> [[VEC2]], i64 1
+; NOFP16-NEXT: [[ADD1:%.*]] = fadd fast half [[ELT1]], [[ELT0]]
+; NOFP16-NEXT: ret half [[ADD1]]
+;
+; FULLFP16-LABEL: define half @reduce_fast_half2(
+; FULLFP16-SAME: <2 x half> [[VEC2:%.*]]) #[[ATTR0:[0-9]+]] {
+; FULLFP16-NEXT: [[ENTRY:.*:]]
+; FULLFP16-NEXT: [[TMP0:%.*]] = call fast half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> [[VEC2]])
+; FULLFP16-NEXT: ret half [[TMP0]]
;
entry:
%elt0 = extractelement <2 x half> %vec2, i64 0
@@ -20,7 +26,7 @@ entry:
define half @reduce_half2(<2 x half> %vec2) {
; CHECK-LABEL: define half @reduce_half2(
-; CHECK-SAME: <2 x half> [[VEC2:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <2 x half> [[VEC2:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[ELT0:%.*]] = extractelement <2 x half> [[VEC2]], i64 0
; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x half> [[VEC2]], i64 1
@@ -269,9 +275,7 @@ define float @reduce_fast_float2(<2 x float> %vec2) {
; CHECK-LABEL: define float @reduce_fast_float2(
; CHECK-SAME: <2 x float> [[VEC2:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[ELT0:%.*]] = extractelement <2 x float> [[VEC2]], i64 0
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[VEC2]], i64 1
-; CHECK-NEXT: [[ADD1:%.*]] = fadd fast float [[ELT1]], [[ELT0]]
+; CHECK-NEXT: [[ADD1:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[VEC2]])
; CHECK-NEXT: ret float [[ADD1]]
;
entry:
@@ -409,9 +413,7 @@ define double @reduce_fast_double2(<2 x double> %vec2) {
; CHECK-LABEL: define double @reduce_fast_double2(
; CHECK-SAME: <2 x double> [[VEC2:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[ELT0:%.*]] = extractelement <2 x double> [[VEC2]], i64 0
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x double> [[VEC2]], i64 1
-; CHECK-NEXT: [[ADD1:%.*]] = fadd fast double [[ELT1]], [[ELT0]]
+; CHECK-NEXT: [[ADD1:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> [[VEC2]])
; CHECK-NEXT: ret double [[ADD1]]
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll
index 03f67ecb3e695..543f19225d74f 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll
@@ -216,9 +216,7 @@ define void @slp_profitable_missing_fmf_nnans_only(ptr %A, ptr %B) {
define float @slp_not_profitable_in_loop(float %x, ptr %A) {
; CHECK-LABEL: @slp_not_profitable_in_loop(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 2
-; CHECK-NEXT: [[L_1:%.*]] = load float, ptr [[GEP_A_2]], align 4
-; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[A]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[A:%.*]], align 4
; CHECK-NEXT: [[L_3:%.*]] = load float, ptr [[A]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> <float poison, float 3.000000e+00>, float [[X:%.*]], i32 0
; CHECK-NEXT: br label [[LOOP:%.*]]
@@ -226,12 +224,8 @@ define float @slp_not_profitable_in_loop(float %x, ptr %A) {
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
; CHECK-NEXT: [[RED:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[RED_NEXT:%.*]], [[LOOP]] ]
; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP0]]
-; CHECK-NEXT: [[MUL12:%.*]] = fmul fast float 3.000000e+00, [[L_1]]
; CHECK-NEXT: [[MUL16:%.*]] = fmul fast float 3.000000e+00, [[L_3]]
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
-; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[MUL12]], [[TMP3]]
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
-; CHECK-NEXT: [[ADD13:%.*]] = fadd fast float [[ADD]], [[TMP4]]
+; CHECK-NEXT: [[ADD13:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP2]])
; CHECK-NEXT: [[RED_NEXT]] = fadd fast float [[ADD13]], [[MUL16]]
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV]], 10
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll
index 651f565412830..1116f8a7fbe27 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll
@@ -141,33 +141,21 @@ define ptr @test4() {
; POWEROF2-NEXT: [[TMP1:%.*]] = fadd <8 x float> zeroinitializer, zeroinitializer
; POWEROF2-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
; POWEROF2-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 5, i32 6>
-; POWEROF2-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 0, i32 4>
; POWEROF2-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> [[TMP2]], i64 0)
; POWEROF2-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP5]], <2 x float> [[TMP3]], i64 2)
; POWEROF2-NEXT: br label [[TMP8:%.*]]
-; POWEROF2: 7:
+; POWEROF2: 6:
; POWEROF2-NEXT: br label [[TMP8]]
-; POWEROF2: 8:
-; POWEROF2-NEXT: [[TMP9:%.*]] = phi <2 x float> [ poison, [[TMP7:%.*]] ], [ [[TMP4]], [[TMP0:%.*]] ]
-; POWEROF2-NEXT: [[TMP10:%.*]] = phi <4 x float> [ poison, [[TMP7]] ], [ [[TMP6]], [[TMP0]] ]
+; POWEROF2: 7:
+; POWEROF2-NEXT: [[TMP10:%.*]] = phi <4 x float> [ poison, [[TMP7:%.*]] ], [ [[TMP6]], [[TMP0:%.*]] ]
; POWEROF2-NEXT: br label [[TMP11:%.*]]
-; POWEROF2: 11:
+; POWEROF2: 9:
; POWEROF2-NEXT: [[TMP12:%.*]] = call <2 x float> @llvm.vector.extract.v2f32.v4f32(<4 x float> [[TMP10]], i64 0)
; POWEROF2-NEXT: [[TMP13:%.*]] = fmul <2 x float> [[TMP12]], zeroinitializer
; POWEROF2-NEXT: [[TMP14:%.*]] = call <2 x float> @llvm.vector.extract.v2f32.v4f32(<4 x float> [[TMP10]], i64 2)
; POWEROF2-NEXT: [[TMP15:%.*]] = fmul <2 x float> zeroinitializer, [[TMP14]]
-; POWEROF2-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP9]], i32 0
-; POWEROF2-NEXT: [[TMP17:%.*]] = fmul float 0.000000e+00, [[TMP18]]
-; POWEROF2-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[TMP9]], i32 1
-; POWEROF2-NEXT: [[TMP19:%.*]] = fmul float [[TMP30]], 0.000000e+00
-; POWEROF2-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[TMP13]], i32 0
-; POWEROF2-NEXT: [[TMP21:%.*]] = fadd reassoc nsz float [[TMP20]], [[TMP17]]
-; POWEROF2-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[TMP15]], i32 0
-; POWEROF2-NEXT: [[TMP23:%.*]] = fadd reassoc nsz float [[TMP22]], [[TMP19]]
-; POWEROF2-NEXT: [[TMP24:%.*]] = extractelement <2 x float> [[TMP13]], i32 1
-; POWEROF2-NEXT: [[TMP25:%.*]] = fadd reassoc nsz float [[TMP21]], [[TMP24]]
-; POWEROF2-NEXT: [[TMP26:%.*]] = extractelement <2 x float> [[TMP15]], i32 1
-; POWEROF2-NEXT: [[TMP27:%.*]] = fadd reassoc nsz float [[TMP23]], [[TMP26]]
+; POWEROF2-NEXT: [[TMP25:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP13]])
+; POWEROF2-NEXT: [[TMP27:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP15]])
; POWEROF2-NEXT: [[TMP28:%.*]] = tail call float @llvm.sqrt.f32(float [[TMP25]])
; POWEROF2-NEXT: [[TMP29:%.*]] = tail call float @llvm.sqrt.f32(float [[TMP27]])
; POWEROF2-NEXT: ret ptr null
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll
index 481d586e6658a..27de36e601512 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll
@@ -513,21 +513,16 @@ define i32 @dot_product_i32(ptr %a, ptr %b) {
;
; POW2-ONLY-LABEL: @dot_product_i32(
; POW2-ONLY-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 0
-; POW2-ONLY-NEXT: [[L_A_0:%.*]] = load i32, ptr [[GEP_A_0]], align 4
-; POW2-ONLY-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 1
-; POW2-ONLY-NEXT: [[L_A_1:%.*]] = load i32, ptr [[GEP_A_1]], align 4
; POW2-ONLY-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 2
; POW2-ONLY-NEXT: [[L_A_2:%.*]] = load i32, ptr [[GEP_A_2]], align 4
; POW2-ONLY-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 0
-; POW2-ONLY-NEXT: [[L_B_0:%.*]] = load i32, ptr [[GEP_B_0]], align 4
-; POW2-ONLY-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 1
-; POW2-ONLY-NEXT: [[L_B_1:%.*]] = load i32, ptr [[GEP_B_1]], align 4
; POW2-ONLY-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 2
; POW2-ONLY-NEXT: [[L_B_2:%.*]] = load i32, ptr [[GEP_B_2]], align 4
-; POW2-ONLY-NEXT: [[MUL_0:%.*]] = mul nsw i32 [[L_A_0]], [[L_B_0]]
-; POW2-ONLY-NEXT: [[MUL_1:%.*]] = mul nsw i32 [[L_A_1]], [[L_B_1]]
+; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[GEP_A_0]], align 4
+; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[GEP_B_0]...
[truncated]
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
Created using spr 1.3.5
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm supportive of the direction, and the RISCV changes look fine.
return false; | ||
return tryToVectorizeList( | ||
{Candidates[*BestCandidate].first, Candidates[*BestCandidate].second}, R); | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please can you land this move as an NFC commit and then rebase? I can't tell if there are any changes to the implementation.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM with a couple of minors
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s | ||
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s | ||
; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,CHECK-NOAVX | ||
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,CHECK-AVX |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Technically corei7 is not AVX - it's an SSE4.2 CPU, but its costs are close enough that it's fine to keep it on CHECK-AVX for now.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I can rename to SSE/NONSSE instead
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks, but no need - I might alter the tested CPUs in a future patch though.
case RecurKind::And: | ||
case RecurKind::Xor: | ||
case RecurKind::FAdd: | ||
case RecurKind::FMul: { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can't the integer min/max kinds be included here?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We do not vectorize it here for now, just binary operations and compares
Created using spr 1.3.5
Added emission of the 2-element reduction instead of 2 extracts + scalar op, when trying to vectorize operands of the instruction, if it is more profitable.