[SLP] Emit reduction instead of 2 extracts + scalar op, when vectorizing operands #147583

alexey-bataev · 2025-07-08T18:37:55Z

Added emission of the 2-element reduction instead of 2 extracts + scalar op, when trying to vectorize operands of the instruction, if it is more profitable.

Created using spr 1.3.5

llvmbot · 2025-07-08T18:38:32Z

@llvm/pr-subscribers-vectorizers

@llvm/pr-subscribers-llvm-transforms

Author: Alexey Bataev (alexey-bataev)

Changes

Added emission of the 2-element reduction instead of 2 extracts + scalar op, when trying to vectorize operands of the instruction, if it is more profitable.

Patch is 36.41 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/147583.diff

8 Files Affected:

(modified) llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (+116-53)
(modified) llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll (+2-6)
(modified) llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll (+16-14)
(modified) llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll (+2-8)
(modified) llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll (+6-18)
(modified) llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll (+11-27)
(modified) llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll (+80-48)
(modified) llvm/test/Transforms/SLPVectorizer/extracts-with-undefs.ll (+3-3)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index c93af749507f8..bec393051a257 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -21676,58 +21676,6 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
   return Changed;
 }
 
-bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
-  if (!I)
-    return false;
-
-  if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
-    return false;
-
-  Value *P = I->getParent();
-
-  // Vectorize in current basic block only.
-  auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
-  auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
-  if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
-      R.isDeleted(Op0) || R.isDeleted(Op1))
-    return false;
-
-  // First collect all possible candidates
-  SmallVector<std::pair<Value *, Value *>, 4> Candidates;
-  Candidates.emplace_back(Op0, Op1);
-
-  auto *A = dyn_cast<BinaryOperator>(Op0);
-  auto *B = dyn_cast<BinaryOperator>(Op1);
-  // Try to skip B.
-  if (A && B && B->hasOneUse()) {
-    auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
-    auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
-    if (B0 && B0->getParent() == P && !R.isDeleted(B0))
-      Candidates.emplace_back(A, B0);
-    if (B1 && B1->getParent() == P && !R.isDeleted(B1))
-      Candidates.emplace_back(A, B1);
-  }
-  // Try to skip A.
-  if (B && A && A->hasOneUse()) {
-    auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
-    auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
-    if (A0 && A0->getParent() == P && !R.isDeleted(A0))
-      Candidates.emplace_back(A0, B);
-    if (A1 && A1->getParent() == P && !R.isDeleted(A1))
-      Candidates.emplace_back(A1, B);
-  }
-
-  if (Candidates.size() == 1)
-    return tryToVectorizeList({Op0, Op1}, R);
-
-  // We have multiple options. Try to pick the single best.
-  std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
-  if (!BestCandidate)
-    return false;
-  return tryToVectorizeList(
-      {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second}, R);
-}
-
 namespace {
 
 /// Model horizontal reductions.
@@ -21770,6 +21718,8 @@ class HorizontalReduction {
   /// Checks if the optimization of original scalar identity operations on
   /// matched horizontal reductions is enabled and allowed.
   bool IsSupportedHorRdxIdentityOp = false;
+  /// The minimum number of the reduced values.
+  const unsigned ReductionLimit = VectorizeNonPowerOf2 ? 3 : 4;
   /// Contains vector values for reduction including their scale factor and
   /// signedness.
   SmallVector<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales;
@@ -22068,6 +22018,24 @@ class HorizontalReduction {
 
 public:
   HorizontalReduction() = default;
+  HorizontalReduction(Instruction *I, ArrayRef<Value *> Ops)
+      : ReductionRoot(I), ReductionLimit(2) {
+    RdxKind = HorizontalReduction::getRdxKind(I);
+    ReductionOps.emplace_back().push_back(I);
+    ReducedVals.emplace_back().assign(Ops.begin(), Ops.end());
+    for (Value *V : Ops)
+      ReducedValsToOps[V].push_back(I);
+  }
+
+  bool matchReductionForOperands() const {
+    // Analyze "regular" integer/FP types for reductions - no target-specific
+    // types or pointers.
+    assert(ReductionRoot && "Reduction root is not set!");
+    if (!isVectorizable(RdxKind, cast<Instruction>(ReductionRoot)))
+      return false;
+
+    return true;
+  }
 
   /// Try to find a reduction tree.
   bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
@@ -22235,7 +22203,6 @@ class HorizontalReduction {
   /// Attempt to vectorize the tree found by matchAssociativeReduction.
   Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
                      const TargetLibraryInfo &TLI, AssumptionCache *AC) {
-    const unsigned ReductionLimit = VectorizeNonPowerOf2 ? 3 : 4;
     constexpr unsigned RegMaxNumber = 4;
     constexpr unsigned RedValsMaxNumber = 128;
     // If there are a sufficient number of reduction values, reduce
@@ -23740,6 +23707,102 @@ bool SLPVectorizerPass::vectorizeHorReduction(
   return Res;
 }
 
+bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
+  if (!I)
+    return false;
+
+  if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
+    return false;
+
+  Value *P = I->getParent();
+
+  // Vectorize in current basic block only.
+  auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
+  auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
+  if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
+      R.isDeleted(Op0) || R.isDeleted(Op1))
+    return false;
+
+  // First collect all possible candidates
+  SmallVector<std::pair<Value *, Value *>, 4> Candidates;
+  Candidates.emplace_back(Op0, Op1);
+
+  auto *A = dyn_cast<BinaryOperator>(Op0);
+  auto *B = dyn_cast<BinaryOperator>(Op1);
+  // Try to skip B.
+  if (A && B && B->hasOneUse()) {
+    auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
+    auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
+    if (B0 && B0->getParent() == P && !R.isDeleted(B0))
+      Candidates.emplace_back(A, B0);
+    if (B1 && B1->getParent() == P && !R.isDeleted(B1))
+      Candidates.emplace_back(A, B1);
+  }
+  // Try to skip A.
+  if (B && A && A->hasOneUse()) {
+    auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
+    auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
+    if (A0 && A0->getParent() == P && !R.isDeleted(A0))
+      Candidates.emplace_back(A0, B);
+    if (A1 && A1->getParent() == P && !R.isDeleted(A1))
+      Candidates.emplace_back(A1, B);
+  }
+
+  auto TryToReduce = [this, &R, &TTI=*TTI](Instruction *Inst, ArrayRef<Value *> Ops) {
+    if (!isReductionCandidate(Inst))
+      return false;
+    Type *Ty = Inst->getType();
+    if (!isValidElementType(Ty) || Ty->isPointerTy())
+      return false;
+    HorizontalReduction HorRdx(Inst, Ops);
+    if (!HorRdx.matchReductionForOperands())
+      return false;
+    // Check the cost of operations.
+    VectorType *VecTy= getWidenedType(Ty, Ops.size());
+    constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+    InstructionCost ScalarCost =
+        TTI.getScalarizationOverhead(
+            VecTy, APInt::getAllOnes(getNumElements(VecTy)), /*Insert=*/false,
+            /*Extract=*/true, CostKind) +
+        TTI.getInstructionCost(Inst, CostKind);
+    InstructionCost RedCost;
+    switch (::getRdxKind(Inst)) {
+    case RecurKind::Add:
+    case RecurKind::Mul:
+    case RecurKind::Or:
+    case RecurKind::And:
+    case RecurKind::Xor:
+    case RecurKind::FAdd:
+    case RecurKind::FMul: {
+      FastMathFlags FMF;
+      if (auto *FPCI = dyn_cast<FPMathOperator>(Inst))
+        FMF = FPCI->getFastMathFlags();
+      RedCost = TTI.getArithmeticReductionCost(Inst->getOpcode(), VecTy, FMF,
+                                               CostKind);
+      break;
+    }
+    default:
+      return false;
+    }
+    if (RedCost >= ScalarCost)
+      return false;
+
+    return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC) != nullptr;
+  };
+  if (Candidates.size() == 1)
+    return TryToReduce(I, {Op0, Op1}) || tryToVectorizeList({Op0, Op1}, R);
+
+  // We have multiple options. Try to pick the single best.
+  std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
+  if (!BestCandidate)
+    return false;
+  return TryToReduce(I, {Candidates[*BestCandidate].first,
+                         Candidates[*BestCandidate].second}) ||
+         tryToVectorizeList({Candidates[*BestCandidate].first,
+                             Candidates[*BestCandidate].second},
+                            R);
+}
+
 bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
                                                  BasicBlock *BB, BoUpSLP &R) {
   SmallVector<WeakTrackingVH> PostponedInsts;
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll
index 19b6d82818532..442769937ac12 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll
@@ -16,9 +16,7 @@ define void @test1(ptr nocapture readonly %J, i32 %xmin, i32 %ymin) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x float>, ptr [[J:%.*]], align 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = fsub fast <2 x float> [[TMP2]], [[TMP4]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast <2 x float> [[TMP5]], [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x float> [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x float> [[TMP6]], i32 1
-; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[ADD:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP6]])
 ; CHECK-NEXT:    [[CMP:%.*]] = fcmp oeq float [[ADD]], 0.000000e+00
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY3_LR_PH]], label [[FOR_END27:%.*]]
 ; CHECK:       for.end27:
@@ -57,9 +55,7 @@ define void @test2(ptr nocapture readonly %J, i32 %xmin, i32 %ymin) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x float>, ptr [[J:%.*]], align 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = fsub fast <2 x float> [[TMP2]], [[TMP4]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast <2 x float> [[TMP5]], [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x float> [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x float> [[TMP6]], i32 1
-; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP8]], [[TMP7]]
+; CHECK-NEXT:    [[ADD:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP6]])
 ; CHECK-NEXT:    [[CMP:%.*]] = fcmp oeq float [[ADD]], 0.000000e+00
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY3_LR_PH]], label [[FOR_END27:%.*]]
 ; CHECK:       for.end27:
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll
index 677d52bf3b4c3..8d4a1152fe4da 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll
@@ -3,13 +3,19 @@
 ; RUN: opt < %s -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK,FULLFP16
 
 define half @reduce_fast_half2(<2 x half> %vec2) {
-; CHECK-LABEL: define half @reduce_fast_half2(
-; CHECK-SAME: <2 x half> [[VEC2:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[ELT0:%.*]] = extractelement <2 x half> [[VEC2]], i64 0
-; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <2 x half> [[VEC2]], i64 1
-; CHECK-NEXT:    [[ADD1:%.*]] = fadd fast half [[ELT1]], [[ELT0]]
-; CHECK-NEXT:    ret half [[ADD1]]
+; NOFP16-LABEL: define half @reduce_fast_half2(
+; NOFP16-SAME: <2 x half> [[VEC2:%.*]]) #[[ATTR0:[0-9]+]] {
+; NOFP16-NEXT:  [[ENTRY:.*:]]
+; NOFP16-NEXT:    [[ELT0:%.*]] = extractelement <2 x half> [[VEC2]], i64 0
+; NOFP16-NEXT:    [[ELT1:%.*]] = extractelement <2 x half> [[VEC2]], i64 1
+; NOFP16-NEXT:    [[ADD1:%.*]] = fadd fast half [[ELT1]], [[ELT0]]
+; NOFP16-NEXT:    ret half [[ADD1]]
+;
+; FULLFP16-LABEL: define half @reduce_fast_half2(
+; FULLFP16-SAME: <2 x half> [[VEC2:%.*]]) #[[ATTR0:[0-9]+]] {
+; FULLFP16-NEXT:  [[ENTRY:.*:]]
+; FULLFP16-NEXT:    [[TMP0:%.*]] = call fast half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> [[VEC2]])
+; FULLFP16-NEXT:    ret half [[TMP0]]
 ;
 entry:
   %elt0 = extractelement <2 x half> %vec2, i64 0
@@ -20,7 +26,7 @@ entry:
 
 define half @reduce_half2(<2 x half> %vec2) {
 ; CHECK-LABEL: define half @reduce_half2(
-; CHECK-SAME: <2 x half> [[VEC2:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <2 x half> [[VEC2:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[ELT0:%.*]] = extractelement <2 x half> [[VEC2]], i64 0
 ; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <2 x half> [[VEC2]], i64 1
@@ -269,9 +275,7 @@ define float @reduce_fast_float2(<2 x float> %vec2) {
 ; CHECK-LABEL: define float @reduce_fast_float2(
 ; CHECK-SAME: <2 x float> [[VEC2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[ELT0:%.*]] = extractelement <2 x float> [[VEC2]], i64 0
-; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <2 x float> [[VEC2]], i64 1
-; CHECK-NEXT:    [[ADD1:%.*]] = fadd fast float [[ELT1]], [[ELT0]]
+; CHECK-NEXT:    [[ADD1:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[VEC2]])
 ; CHECK-NEXT:    ret float [[ADD1]]
 ;
 entry:
@@ -409,9 +413,7 @@ define double @reduce_fast_double2(<2 x double> %vec2) {
 ; CHECK-LABEL: define double @reduce_fast_double2(
 ; CHECK-SAME: <2 x double> [[VEC2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[ELT0:%.*]] = extractelement <2 x double> [[VEC2]], i64 0
-; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <2 x double> [[VEC2]], i64 1
-; CHECK-NEXT:    [[ADD1:%.*]] = fadd fast double [[ELT1]], [[ELT0]]
+; CHECK-NEXT:    [[ADD1:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> [[VEC2]])
 ; CHECK-NEXT:    ret double [[ADD1]]
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll
index 03f67ecb3e695..543f19225d74f 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll
@@ -216,9 +216,7 @@ define void @slp_profitable_missing_fmf_nnans_only(ptr %A, ptr %B) {
 define float @slp_not_profitable_in_loop(float %x, ptr %A) {
 ; CHECK-LABEL: @slp_not_profitable_in_loop(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[GEP_A_2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 2
-; CHECK-NEXT:    [[L_1:%.*]] = load float, ptr [[GEP_A_2]], align 4
-; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[A]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[A:%.*]], align 4
 ; CHECK-NEXT:    [[L_3:%.*]] = load float, ptr [[A]], align 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> <float poison, float 3.000000e+00>, float [[X:%.*]], i32 0
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
@@ -226,12 +224,8 @@ define float @slp_not_profitable_in_loop(float %x, ptr %A) {
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[RED:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[RED_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP0]]
-; CHECK-NEXT:    [[MUL12:%.*]] = fmul fast float 3.000000e+00, [[L_1]]
 ; CHECK-NEXT:    [[MUL16:%.*]] = fmul fast float 3.000000e+00, [[L_3]]
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
-; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[MUL12]], [[TMP3]]
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
-; CHECK-NEXT:    [[ADD13:%.*]] = fadd fast float [[ADD]], [[TMP4]]
+; CHECK-NEXT:    [[ADD13:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP2]])
 ; CHECK-NEXT:    [[RED_NEXT]] = fadd fast float [[ADD13]], [[MUL16]]
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i64 [[IV]], 10
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll
index 651f565412830..1116f8a7fbe27 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll
@@ -141,33 +141,21 @@ define ptr @test4() {
 ; POWEROF2-NEXT:    [[TMP1:%.*]] = fadd <8 x float> zeroinitializer, zeroinitializer
 ; POWEROF2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
 ; POWEROF2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 5, i32 6>
-; POWEROF2-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 0, i32 4>
 ; POWEROF2-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> [[TMP2]], i64 0)
 ; POWEROF2-NEXT:    [[TMP6:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP5]], <2 x float> [[TMP3]], i64 2)
 ; POWEROF2-NEXT:    br label [[TMP8:%.*]]
-; POWEROF2:       7:
+; POWEROF2:       6:
 ; POWEROF2-NEXT:    br label [[TMP8]]
-; POWEROF2:       8:
-; POWEROF2-NEXT:    [[TMP9:%.*]] = phi <2 x float> [ poison, [[TMP7:%.*]] ], [ [[TMP4]], [[TMP0:%.*]] ]
-; POWEROF2-NEXT:    [[TMP10:%.*]] = phi <4 x float> [ poison, [[TMP7]] ], [ [[TMP6]], [[TMP0]] ]
+; POWEROF2:       7:
+; POWEROF2-NEXT:    [[TMP10:%.*]] = phi <4 x float> [ poison, [[TMP7:%.*]] ], [ [[TMP6]], [[TMP0:%.*]] ]
 ; POWEROF2-NEXT:    br label [[TMP11:%.*]]
-; POWEROF2:       11:
+; POWEROF2:       9:
 ; POWEROF2-NEXT:    [[TMP12:%.*]] = call <2 x float> @llvm.vector.extract.v2f32.v4f32(<4 x float> [[TMP10]], i64 0)
 ; POWEROF2-NEXT:    [[TMP13:%.*]] = fmul <2 x float> [[TMP12]], zeroinitializer
 ; POWEROF2-NEXT:    [[TMP14:%.*]] = call <2 x float> @llvm.vector.extract.v2f32.v4f32(<4 x float> [[TMP10]], i64 2)
 ; POWEROF2-NEXT:    [[TMP15:%.*]] = fmul <2 x float> zeroinitializer, [[TMP14]]
-; POWEROF2-NEXT:    [[TMP18:%.*]] = extractelement <2 x float> [[TMP9]], i32 0
-; POWEROF2-NEXT:    [[TMP17:%.*]] = fmul float 0.000000e+00, [[TMP18]]
-; POWEROF2-NEXT:    [[TMP30:%.*]] = extractelement <2 x float> [[TMP9]], i32 1
-; POWEROF2-NEXT:    [[TMP19:%.*]] = fmul float [[TMP30]], 0.000000e+00
-; POWEROF2-NEXT:    [[TMP20:%.*]] = extractelement <2 x float> [[TMP13]], i32 0
-; POWEROF2-NEXT:    [[TMP21:%.*]] = fadd reassoc nsz float [[TMP20]], [[TMP17]]
-; POWEROF2-NEXT:    [[TMP22:%.*]] = extractelement <2 x float> [[TMP15]], i32 0
-; POWEROF2-NEXT:    [[TMP23:%.*]] = fadd reassoc nsz float [[TMP22]], [[TMP19]]
-; POWEROF2-NEXT:    [[TMP24:%.*]] = extractelement <2 x float> [[TMP13]], i32 1
-; POWEROF2-NEXT:    [[TMP25:%.*]] = fadd reassoc nsz float [[TMP21]], [[TMP24]]
-; POWEROF2-NEXT:    [[TMP26:%.*]] = extractelement <2 x float> [[TMP15]], i32 1
-; POWEROF2-NEXT:    [[TMP27:%.*]] = fadd reassoc nsz float [[TMP23]], [[TMP26]]
+; POWEROF2-NEXT:    [[TMP25:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP13]])
+; POWEROF2-NEXT:    [[TMP27:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP15]])
 ; POWEROF2-NEXT:    [[TMP28:%.*]] = tail call float @llvm.sqrt.f32(float [[TMP25]])
 ; POWEROF2-NEXT:    [[TMP29:%.*]] = tail call float @llvm.sqrt.f32(float [[TMP27]])
 ; POWEROF2-NEXT:    ret ptr null
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll
index 481d586e6658a..27de36e601512 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll
@@ -513,21 +513,16 @@ define i32 @dot_product_i32(ptr %a, ptr %b) {
 ;
 ; POW2-ONLY-LABEL: @dot_product_i32(
 ; POW2-ONLY-NEXT:    [[GEP_A_0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 0
-; POW2-ONLY-NEXT:    [[L_A_0:%.*]] = load i32, ptr [[GEP_A_0]], align 4
-; POW2-ONLY-NEXT:    [[GEP_A_1:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 1
-; POW2-ONLY-NEXT:    [[L_A_1:%.*]] = load i32, ptr [[GEP_A_1]], align 4
 ; POW2-ONLY-NEXT:    [[GEP_A_2:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 2
 ; POW2-ONLY-NEXT:    [[L_A_2:%.*]] = load i32, ptr [[GEP_A_2]], align 4
 ; POW2-ONLY-NEXT:    [[GEP_B_0:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 0
-; POW2-ONLY-NEXT:    [[L_B_0:%.*]] = load i32, ptr [[GEP_B_0]], align 4
-; POW2-ONLY-NEXT:    [[GEP_B_1:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 1
-; POW2-ONLY-NEXT:    [[L_B_1:%.*]] = load i32, ptr [[GEP_B_1]], align 4
 ; POW2-ONLY-NEXT:    [[GEP_B_2:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 2
 ; POW2-ONLY-NEXT:    [[L_B_2:%.*]] = load i32, ptr [[GEP_B_2]], align 4
-; POW2-ONLY-NEXT:    [[MUL_0:%.*]] = mul nsw i32 [[L_A_0]], [[L_B_0]]
-; POW2-ONLY-NEXT:    [[MUL_1:%.*]] = mul nsw i32 [[L_A_1]], [[L_B_1]]
+; POW2-ONLY-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[GEP_A_0]], align 4
+; POW2-ONLY-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[GEP_B_0]...
[truncated]

github-actions · 2025-07-08T18:40:53Z

✅ With the latest revision this PR passed the C/C++ code formatter.

Created using spr 1.3.5

preames

I'm supportive of the direction, and the RISCV changes look fine.

RKSimon · 2025-07-09T09:28:39Z

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

-    return false;
-  return tryToVectorizeList(
-      {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second}, R);
-}


please can you move this as a NFC and the rebase? I can't tell if there's any changes to the implementation

Created using spr 1.3.5

RKSimon

LGTM with a a couple of minors

RKSimon · 2025-07-09T15:29:20Z

llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll

-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,CHECK-NOAVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,CHECK-AVX


Technically corei7 is not AVX - its a SSE4.2 cpu, but its costs are close enough that its fine to keep it on CHECK-AVX for now.

I can rename to SSE/NONSSE instead

thanks but no need - but I might alter the tested cpus in a future patch though

RKSimon · 2025-07-09T15:30:17Z

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

+    case RecurKind::And:
+    case RecurKind::Xor:
+    case RecurKind::FAdd:
+    case RecurKind::FMul: {


Can't the integer min/max kinds be included here?

We do not vectorize it here for now, just binaryoperations and compares

Created using spr 1.3.5

[𝘀𝗽𝗿] initial version

84ff08f

Created using spr 1.3.5

llvmbot added vectorizers llvm:transforms labels Jul 8, 2025

alexey-bataev requested review from preames, hiraditya and RKSimon July 8, 2025 18:38

Fix formatting

85f3eab

Created using spr 1.3.5

preames reviewed Jul 8, 2025

View reviewed changes

RKSimon reviewed Jul 9, 2025

View reviewed changes

Rebase

0c24c4b

Created using spr 1.3.5

RKSimon approved these changes Jul 9, 2025

View reviewed changes

Rebase, supported ordered reductions

0cc841a

Created using spr 1.3.5

davemgreen mentioned this pull request Jul 9, 2025

[AArch64] Scalarize v2f16 vecreduce.fadd #147783

Open

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[SLP] Emit reduction instead of 2 extracts + scalar op, when vectorizing operands #147583

[SLP] Emit reduction instead of 2 extracts + scalar op, when vectorizing operands #147583

Uh oh!

alexey-bataev commented Jul 8, 2025

Uh oh!

llvmbot commented Jul 8, 2025 •

edited

Loading

Uh oh!

github-actions bot commented Jul 8, 2025 •

edited

Loading

Uh oh!

preames left a comment

Uh oh!

RKSimon Jul 9, 2025

Uh oh!

RKSimon left a comment

Uh oh!

RKSimon Jul 9, 2025

Uh oh!

alexey-bataev Jul 9, 2025

Uh oh!

RKSimon Jul 9, 2025

Uh oh!

RKSimon Jul 9, 2025

Uh oh!

alexey-bataev Jul 9, 2025

Uh oh!

Uh oh!

[SLP] Emit reduction instead of 2 extracts + scalar op, when vectorizing operands #147583

Are you sure you want to change the base?

[SLP] Emit reduction instead of 2 extracts + scalar op, when vectorizing operands #147583

Uh oh!

Conversation

alexey-bataev commented Jul 8, 2025

Uh oh!

llvmbot commented Jul 8, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

github-actions bot commented Jul 8, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

preames left a comment

Choose a reason for hiding this comment

Uh oh!

RKSimon Jul 9, 2025

Choose a reason for hiding this comment

Uh oh!

RKSimon left a comment

Choose a reason for hiding this comment

Uh oh!

RKSimon Jul 9, 2025

Choose a reason for hiding this comment

Uh oh!

alexey-bataev Jul 9, 2025

Choose a reason for hiding this comment

Uh oh!

RKSimon Jul 9, 2025

Choose a reason for hiding this comment

Uh oh!

RKSimon Jul 9, 2025

Choose a reason for hiding this comment

Uh oh!

alexey-bataev Jul 9, 2025

Choose a reason for hiding this comment

Uh oh!

Uh oh!

llvmbot commented Jul 8, 2025 •

edited

Loading

github-actions bot commented Jul 8, 2025 •

edited

Loading