diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index d3761ff43f437..93ed6f3ee4a8c 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -21722,6 +21722,8 @@ class HorizontalReduction {
   /// Checks if the optimization of original scalar identity operations on
   /// matched horizontal reductions is enabled and allowed.
   bool IsSupportedHorRdxIdentityOp = false;
+  /// The minimum number of the reduced values.
+  const unsigned ReductionLimit = VectorizeNonPowerOf2 ? 3 : 4;
   /// Contains vector values for reduction including their scale factor and
   /// signedness.
   SmallVector<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales;
@@ -21740,7 +21742,8 @@ class HorizontalReduction {
   }
 
   /// Checks if instruction is associative and can be vectorized.
-  static bool isVectorizable(RecurKind Kind, Instruction *I) {
+  static bool isVectorizable(RecurKind Kind, Instruction *I,
+                             bool TwoElementReduction = false) {
     if (Kind == RecurKind::None)
       return false;
 
@@ -21749,6 +21752,10 @@ class HorizontalReduction {
         isBoolLogicOp(I))
       return true;
 
+    // No need to check for associativity, if 2 reduced values.
+    if (TwoElementReduction)
+      return true;
+
     if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
       // FP min/max are associative except for NaN and -0.0. We do not
       // have to rule out -0.0 here because the intrinsic semantics do not
@@ -22020,6 +22027,27 @@ class HorizontalReduction {
 
 public:
   HorizontalReduction() = default;
+  HorizontalReduction(Instruction *I, ArrayRef<Value *> Ops)
+      : ReductionRoot(I), ReductionLimit(2) {
+    RdxKind = HorizontalReduction::getRdxKind(I);
+    ReductionOps.emplace_back().push_back(I);
+    ReducedVals.emplace_back().assign(Ops.begin(), Ops.end());
+    for (Value *V : Ops)
+      ReducedValsToOps[V].push_back(I);
+  }
+
+  bool matchReductionForOperands() const {
+    // Analyze "regular" integer/FP types for reductions - no target-specific
+    // types or pointers.
+    assert(ReductionRoot && "Reduction root is not set!");
+    if (!isVectorizable(RdxKind, cast<Instruction>(ReductionRoot),
+                        all_of(ReducedVals, [](ArrayRef<Value *> Ops) {
+                          return Ops.size() == 2;
+                        })))
+      return false;
+
+    return true;
+  }
 
   /// Try to find a reduction tree.
   bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
@@ -22187,7 +22215,6 @@ class HorizontalReduction {
   /// Attempt to vectorize the tree found by matchAssociativeReduction.
   Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
                      const TargetLibraryInfo &TLI, AssumptionCache *AC) {
-    const unsigned ReductionLimit = VectorizeNonPowerOf2 ? 3 : 4;
     constexpr unsigned RegMaxNumber = 4;
     constexpr unsigned RedValsMaxNumber = 128;
     // If there are a sufficient number of reduction values, reduce
@@ -23736,15 +23763,60 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
       Candidates.emplace_back(A1, B);
   }
 
+  auto TryToReduce = [this, &R, &TTI = *TTI](Instruction *Inst,
+                                             ArrayRef<Value *> Ops) {
+    if (!isReductionCandidate(Inst))
+      return false;
+    Type *Ty = Inst->getType();
+    if (!isValidElementType(Ty) || Ty->isPointerTy())
+      return false;
+    HorizontalReduction HorRdx(Inst, Ops);
+    if (!HorRdx.matchReductionForOperands())
+      return false;
+    // Check the cost of operations.
+    VectorType *VecTy = getWidenedType(Ty, Ops.size());
+    constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+    InstructionCost ScalarCost =
+        TTI.getScalarizationOverhead(
+            VecTy, APInt::getAllOnes(getNumElements(VecTy)), /*Insert=*/false,
+            /*Extract=*/true, CostKind) +
+        TTI.getInstructionCost(Inst, CostKind);
+    InstructionCost RedCost;
+    switch (::getRdxKind(Inst)) {
+    case RecurKind::Add:
+    case RecurKind::Mul:
+    case RecurKind::Or:
+    case RecurKind::And:
+    case RecurKind::Xor:
+    case RecurKind::FAdd:
+    case RecurKind::FMul: {
+      FastMathFlags FMF;
+      if (auto *FPCI = dyn_cast<FPMathOperator>(Inst))
+        FMF = FPCI->getFastMathFlags();
+      RedCost = TTI.getArithmeticReductionCost(Inst->getOpcode(), VecTy, FMF,
+                                               CostKind);
+      break;
+    }
+    default:
+      return false;
+    }
+    if (RedCost >= ScalarCost)
+      return false;
+
+    return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC) != nullptr;
+  };
   if (Candidates.size() == 1)
-    return tryToVectorizeList({Op0, Op1}, R);
+    return TryToReduce(I, {Op0, Op1}) || tryToVectorizeList({Op0, Op1}, R);
 
   // We have multiple options. Try to pick the single best.
   std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
   if (!BestCandidate)
     return false;
-  return tryToVectorizeList(
-      {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second}, R);
+  return TryToReduce(I, {Candidates[*BestCandidate].first,
+                         Candidates[*BestCandidate].second}) ||
+         tryToVectorizeList({Candidates[*BestCandidate].first,
+                             Candidates[*BestCandidate].second},
+                            R);
 }
 
 bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll
index 19b6d82818532..442769937ac12 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll
@@ -16,9 +16,7 @@ define void @test1(ptr nocapture readonly %J, i32 %xmin, i32 %ymin) {
 ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[J:%.*]], align 4
 ; CHECK-NEXT: [[TMP5:%.*]] = fsub fast <2 x float> [[TMP2]], [[TMP4]]
 ; CHECK-NEXT: [[TMP6:%.*]] = fmul fast <2 x float> [[TMP5]], [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP6]], i32 1
-; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP7]], [[TMP8]]
+; CHECK-NEXT: [[ADD:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP6]])
 ; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[ADD]], 0.000000e+00
 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY3_LR_PH]], label [[FOR_END27:%.*]]
 ; CHECK: for.end27:
@@ -57,9 +55,7 @@ define void @test2(ptr nocapture readonly %J, i32 %xmin, i32 %ymin) {
 ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[J:%.*]], align 4
 ; CHECK-NEXT: [[TMP5:%.*]] = fsub fast <2 x float> [[TMP2]], [[TMP4]]
 ; CHECK-NEXT: [[TMP6:%.*]] = fmul fast <2 x float> [[TMP5]], [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP6]], i32 1
-; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP8]], [[TMP7]]
+; CHECK-NEXT: [[ADD:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP6]])
 ; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[ADD]], 0.000000e+00
 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY3_LR_PH]], label [[FOR_END27:%.*]]
 ; CHECK: for.end27:
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll
index 677d52bf3b4c3..8d4a1152fe4da 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll
@@ -3,13 +3,19 @@
 ; RUN: opt < %s -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK,FULLFP16
 
 define half @reduce_fast_half2(<2 x half> %vec2) {
-; CHECK-LABEL: define half @reduce_fast_half2(
-; CHECK-SAME: <2 x half> [[VEC2:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[ELT0:%.*]] = extractelement <2 x half> [[VEC2]], i64 0
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x half> [[VEC2]], i64 1
-; CHECK-NEXT: [[ADD1:%.*]] = fadd fast half [[ELT1]], [[ELT0]]
-; CHECK-NEXT: ret half [[ADD1]]
+; NOFP16-LABEL: define half @reduce_fast_half2(
+; NOFP16-SAME: <2 x half> [[VEC2:%.*]]) #[[ATTR0:[0-9]+]] {
+; NOFP16-NEXT: [[ENTRY:.*:]]
+; NOFP16-NEXT: [[ELT0:%.*]] = extractelement <2 x half> [[VEC2]], i64 0
+; NOFP16-NEXT: [[ELT1:%.*]] = extractelement <2 x half> [[VEC2]], i64 1
+; NOFP16-NEXT: [[ADD1:%.*]] = fadd fast half [[ELT1]], [[ELT0]]
+; NOFP16-NEXT: ret half [[ADD1]]
+;
+; FULLFP16-LABEL: define half @reduce_fast_half2(
+; FULLFP16-SAME: <2 x half> [[VEC2:%.*]]) #[[ATTR0:[0-9]+]] {
+; FULLFP16-NEXT: [[ENTRY:.*:]]
+; FULLFP16-NEXT: [[TMP0:%.*]] = call fast half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> [[VEC2]])
+; FULLFP16-NEXT: ret half [[TMP0]]
 ;
 entry:
   %elt0 = extractelement <2 x half> %vec2, i64 0
@@ -20,7 +26,7 @@ entry:
 
 define half @reduce_half2(<2 x half> %vec2) {
 ; CHECK-LABEL: define half @reduce_half2(
-; CHECK-SAME: <2 x half> [[VEC2:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <2 x half> [[VEC2:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT: [[ENTRY:.*:]]
 ; CHECK-NEXT: [[ELT0:%.*]] = extractelement <2 x half> [[VEC2]], i64 0
 ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x half> [[VEC2]], i64 1
@@ -269,9 +275,7 @@ define float @reduce_fast_float2(<2 x float> %vec2) {
 ; CHECK-LABEL: define float @reduce_fast_float2(
 ; CHECK-SAME: <2 x float> [[VEC2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[ELT0:%.*]] = extractelement <2 x float> [[VEC2]], i64 0
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[VEC2]], i64 1
-; CHECK-NEXT: [[ADD1:%.*]] = fadd fast float [[ELT1]], [[ELT0]]
+; CHECK-NEXT: [[ADD1:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[VEC2]])
 ; CHECK-NEXT: ret float [[ADD1]]
 ;
 entry:
@@ -409,9 +413,7 @@ define double @reduce_fast_double2(<2 x double> %vec2) {
 ; CHECK-LABEL: define double @reduce_fast_double2(
 ; CHECK-SAME: <2 x double> [[VEC2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[ELT0:%.*]] = extractelement <2 x double> [[VEC2]], i64 0
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x double> [[VEC2]], i64 1
-; CHECK-NEXT: [[ADD1:%.*]] = fadd fast double [[ELT1]], [[ELT0]]
+; CHECK-NEXT: [[ADD1:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> [[VEC2]])
 ; CHECK-NEXT: ret double [[ADD1]]
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll
index 03f67ecb3e695..543f19225d74f 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll
@@ -216,9 +216,7 @@ define void @slp_profitable_missing_fmf_nnans_only(ptr %A, ptr %B) {
 define float @slp_not_profitable_in_loop(float %x, ptr %A) {
 ; CHECK-LABEL: @slp_not_profitable_in_loop(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 2
-; CHECK-NEXT: [[L_1:%.*]] = load float, ptr [[GEP_A_2]], align 4
-; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[A]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[A:%.*]], align 4
 ; CHECK-NEXT: [[L_3:%.*]] = load float, ptr [[A]], align 4
 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> , float [[X:%.*]], i32 0
 ; CHECK-NEXT: br label [[LOOP:%.*]]
@@ -226,12 +224,8 @@ define float @slp_not_profitable_in_loop(float %x, ptr %A) {
 ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-NEXT: [[RED:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[RED_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP0]]
-; CHECK-NEXT: [[MUL12:%.*]] = fmul fast float 3.000000e+00, [[L_1]]
 ; CHECK-NEXT: [[MUL16:%.*]] = fmul fast float 3.000000e+00, [[L_3]]
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
-; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[MUL12]], [[TMP3]]
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
-; CHECK-NEXT: [[ADD13:%.*]] = fadd fast float [[ADD]], [[TMP4]]
+; CHECK-NEXT: [[ADD13:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP2]])
 ; CHECK-NEXT: [[RED_NEXT]] = fadd fast float [[ADD13]], [[MUL16]]
 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV]], 10
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll
index 651f565412830..1116f8a7fbe27 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll
@@ -141,33 +141,21 @@ define ptr @test4() {
 ; POWEROF2-NEXT: [[TMP1:%.*]] = fadd <8 x float> zeroinitializer, zeroinitializer
 ; POWEROF2-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32>
 ; POWEROF2-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32>
-; POWEROF2-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32>
 ; POWEROF2-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> [[TMP2]], i64 0)
 ; POWEROF2-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP5]], <2 x float> [[TMP3]], i64 2)
 ; POWEROF2-NEXT: br label [[TMP8:%.*]]
-; POWEROF2: 7:
+; POWEROF2: 6:
 ; POWEROF2-NEXT: br label [[TMP8]]
-; POWEROF2: 8:
-; POWEROF2-NEXT: [[TMP9:%.*]] = phi <2 x float> [ poison, [[TMP7:%.*]] ], [ [[TMP4]], [[TMP0:%.*]] ]
-; POWEROF2-NEXT: [[TMP10:%.*]] = phi <4 x float> [ poison, [[TMP7]] ], [ [[TMP6]], [[TMP0]] ]
+; POWEROF2: 7:
+; POWEROF2-NEXT: [[TMP10:%.*]] = phi <4 x float> [ poison, [[TMP7:%.*]] ], [ [[TMP6]], [[TMP0:%.*]] ]
 ; POWEROF2-NEXT: br label [[TMP11:%.*]]
-; POWEROF2: 11:
+; POWEROF2: 9:
 ; POWEROF2-NEXT: [[TMP12:%.*]] = call <2 x float> @llvm.vector.extract.v2f32.v4f32(<4 x float> [[TMP10]], i64 0)
 ; POWEROF2-NEXT: [[TMP13:%.*]] = fmul <2 x float> [[TMP12]], zeroinitializer
 ; POWEROF2-NEXT: [[TMP14:%.*]] = call <2 x float> @llvm.vector.extract.v2f32.v4f32(<4 x float> [[TMP10]], i64 2)
 ; POWEROF2-NEXT: [[TMP15:%.*]] = fmul <2 x float> zeroinitializer, [[TMP14]]
-; POWEROF2-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP9]], i32 0
-; POWEROF2-NEXT: [[TMP17:%.*]] = fmul float 0.000000e+00, [[TMP18]]
-; POWEROF2-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[TMP9]], i32 1
-; POWEROF2-NEXT: [[TMP19:%.*]] = fmul float [[TMP30]], 0.000000e+00
-; POWEROF2-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[TMP13]], i32 0
-; POWEROF2-NEXT: [[TMP21:%.*]] = fadd reassoc nsz float [[TMP20]], [[TMP17]]
-; POWEROF2-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[TMP15]], i32 0
-; POWEROF2-NEXT: [[TMP23:%.*]] = fadd reassoc nsz float [[TMP22]], [[TMP19]]
-; POWEROF2-NEXT: [[TMP24:%.*]] = extractelement <2 x float> [[TMP13]], i32 1
-; POWEROF2-NEXT: [[TMP25:%.*]] = fadd reassoc nsz float [[TMP21]], [[TMP24]]
-; POWEROF2-NEXT: [[TMP26:%.*]] = extractelement <2 x float> [[TMP15]], i32 1
-; POWEROF2-NEXT: [[TMP27:%.*]] = fadd reassoc nsz float [[TMP23]], [[TMP26]]
+; POWEROF2-NEXT: [[TMP25:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP13]])
+; POWEROF2-NEXT: [[TMP27:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP15]])
 ; POWEROF2-NEXT: [[TMP28:%.*]] = tail call float @llvm.sqrt.f32(float [[TMP25]])
 ; POWEROF2-NEXT: [[TMP29:%.*]] = tail call float @llvm.sqrt.f32(float [[TMP27]])
 ; POWEROF2-NEXT: ret ptr null
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll
index 481d586e6658a..27de36e601512 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll
@@ -513,21 +513,16 @@ define i32 @dot_product_i32(ptr %a, ptr %b) {
 ;
 ; POW2-ONLY-LABEL: @dot_product_i32(
 ; POW2-ONLY-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 0
-; POW2-ONLY-NEXT: [[L_A_0:%.*]] = load i32, ptr [[GEP_A_0]], align 4
-; POW2-ONLY-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 1
-; POW2-ONLY-NEXT: [[L_A_1:%.*]] = load i32, ptr [[GEP_A_1]], align 4
 ; POW2-ONLY-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 2
 ; POW2-ONLY-NEXT: [[L_A_2:%.*]] = load i32, ptr [[GEP_A_2]], align 4
 ; POW2-ONLY-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 0
-; POW2-ONLY-NEXT: [[L_B_0:%.*]] = load i32, ptr [[GEP_B_0]], align 4
-; POW2-ONLY-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 1
-; POW2-ONLY-NEXT: [[L_B_1:%.*]] = load i32, ptr [[GEP_B_1]], align 4
 ; POW2-ONLY-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 2
 ; POW2-ONLY-NEXT: [[L_B_2:%.*]] = load i32, ptr [[GEP_B_2]], align 4
-; POW2-ONLY-NEXT: [[MUL_0:%.*]] = mul nsw i32 [[L_A_0]], [[L_B_0]]
-; POW2-ONLY-NEXT: [[MUL_1:%.*]] = mul nsw i32 [[L_A_1]], [[L_B_1]]
+; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[GEP_A_0]], align 4
+; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[GEP_B_0]], align 4
+; POW2-ONLY-NEXT: [[TMP3:%.*]] = mul nsw <2 x i32> [[TMP1]], [[TMP2]]
 ; POW2-ONLY-NEXT: [[MUL_2:%.*]] = mul nsw i32 [[L_A_2]], [[L_B_2]]
-; POW2-ONLY-NEXT: [[ADD_0:%.*]] = add i32 [[MUL_0]], [[MUL_1]]
+; POW2-ONLY-NEXT: [[ADD_0:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP3]])
 ; POW2-ONLY-NEXT: [[ADD_1:%.*]] = add i32 [[ADD_0]], [[MUL_2]]
 ; POW2-ONLY-NEXT: ret i32 [[ADD_1]]
 ;
@@ -568,21 +563,16 @@ define i32 @dot_product_i32_reorder(ptr %a, ptr %b) {
 ;
 ; POW2-ONLY-LABEL: @dot_product_i32_reorder(
 ; POW2-ONLY-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 0
-; POW2-ONLY-NEXT: [[L_A_0:%.*]] = load i32, ptr [[GEP_A_0]], align 4
-; POW2-ONLY-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 1
-; POW2-ONLY-NEXT: [[L_A_1:%.*]] = load i32, ptr [[GEP_A_1]], align 4
 ; POW2-ONLY-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 2
 ; POW2-ONLY-NEXT: [[L_A_2:%.*]] = load i32, ptr [[GEP_A_2]], align 4
 ; POW2-ONLY-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 0
-; POW2-ONLY-NEXT: [[L_B_0:%.*]] = load i32, ptr [[GEP_B_0]], align 4
-; POW2-ONLY-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 1
-; POW2-ONLY-NEXT: [[L_B_1:%.*]] = load i32, ptr [[GEP_B_1]], align 4
 ; POW2-ONLY-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 2
 ; POW2-ONLY-NEXT: [[L_B_2:%.*]] = load i32, ptr [[GEP_B_2]], align 4
-; POW2-ONLY-NEXT: [[MUL_0:%.*]] = mul nsw i32 [[L_A_0]], [[L_B_0]]
-; POW2-ONLY-NEXT: [[MUL_1:%.*]] = mul nsw i32 [[L_A_1]], [[L_B_1]]
+; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[GEP_A_0]], align 4
+; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[GEP_B_0]], align 4
+; POW2-ONLY-NEXT: [[TMP3:%.*]] = mul nsw <2 x i32> [[TMP1]], [[TMP2]]
 ; POW2-ONLY-NEXT: [[MUL_2:%.*]] = mul nsw i32 [[L_A_2]], [[L_B_2]]
-; POW2-ONLY-NEXT: [[ADD_0:%.*]] = add i32 [[MUL_1]], [[MUL_0]]
+; POW2-ONLY-NEXT: [[ADD_0:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP3]])
 ; POW2-ONLY-NEXT: [[ADD_1:%.*]] = add i32 [[ADD_0]], [[MUL_2]]
 ; POW2-ONLY-NEXT: ret i32 [[ADD_1]]
 ;
@@ -630,9 +620,7 @@ define float @dot_product_fp32(ptr %a, ptr %b) {
 ; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[GEP_B_0]], align 4
 ; POW2-ONLY-NEXT: [[TMP3:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP2]]
 ; POW2-ONLY-NEXT: [[MUL_2:%.*]] = fmul fast float [[L_A_2]], [[L_B_2]]
-; POW2-ONLY-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
-; POW2-ONLY-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
-; POW2-ONLY-NEXT: [[ADD_0:%.*]] = fadd fast float [[TMP4]], [[TMP5]]
+; POW2-ONLY-NEXT: [[ADD_0:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP3]])
 ; POW2-ONLY-NEXT: [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]]
 ; POW2-ONLY-NEXT: ret float [[ADD_1]]
 ;
@@ -682,9 +670,7 @@ define float @dot_product_fp32_reorder(ptr %a, ptr %b) {
 ; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[GEP_B_0]], align 4
 ; POW2-ONLY-NEXT: [[TMP3:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP2]]
 ; POW2-ONLY-NEXT: [[MUL_2:%.*]] = fmul fast float [[L_A_2]], [[L_B_2]]
-; POW2-ONLY-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
-; POW2-ONLY-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
-; POW2-ONLY-NEXT: [[ADD_0:%.*]] = fadd fast float [[TMP5]], [[TMP4]]
+; POW2-ONLY-NEXT: [[ADD_0:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP3]])
 ; POW2-ONLY-NEXT: [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]]
 ; POW2-ONLY-NEXT: ret float [[ADD_1]]
 ;
@@ -733,9 +719,7 @@ define double @dot_product_fp64(ptr %a, ptr %b) {
 ; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[GEP_B_0]], align 4
 ; POW2-ONLY-NEXT: [[TMP3:%.*]] = fmul fast <2 x double> [[TMP1]], [[TMP2]]
 ; POW2-ONLY-NEXT: [[MUL_2:%.*]] = fmul fast double [[L_A_2]], [[L_B_2]]
-; POW2-ONLY-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
-; POW2-ONLY-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
-; POW2-ONLY-NEXT: [[ADD_0:%.*]] = fadd fast double [[TMP4]], [[TMP5]]
+; POW2-ONLY-NEXT: [[ADD_0:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> [[TMP3]])
 ; POW2-ONLY-NEXT: [[ADD_1:%.*]] = fadd fast double [[ADD_0]], [[MUL_2]]
 ; POW2-ONLY-NEXT: ret double [[ADD_1]]
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll b/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll
index f16c879c451c2..8bed947d7a44a 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7 -passes=slp-vectorizer -S | FileCheck %s
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE4
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX
 ;
 ; dot4(ptr x, ptr y) - ((x[0]*y[0])+(x[1]*y[1])+(x[2]*y[2])+(x[3]*y[3]))
 ;
@@ -231,20 +231,44 @@ define float @dot3f32(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %pt
 }
 
 define double @dot3f64_fast(ptr dereferenceable(32) %ptrx, ptr dereferenceable(32) %ptry) {
-; CHECK-LABEL: @dot3f64_fast(
-; CHECK-NEXT: [[PTRX1:%.*]] = getelementptr inbounds double, ptr [[PTRX:%.*]], i64 1
-; CHECK-NEXT: [[PTRY1:%.*]] = getelementptr inbounds double, ptr [[PTRY:%.*]], i64 1
-; CHECK-NEXT: [[X0:%.*]] = load double, ptr [[PTRX]], align 4
-; CHECK-NEXT: [[Y0:%.*]] = load double, ptr [[PTRY]], align 4
-; CHECK-NEXT: [[MUL0:%.*]] = fmul double [[X0]], [[Y0]]
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[PTRX1]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[PTRY1]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
-; CHECK-NEXT: [[DOT01:%.*]] = fadd fast double [[MUL0]], [[TMP4]]
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
-; CHECK-NEXT: [[DOT012:%.*]] = fadd fast double [[DOT01]], [[TMP5]]
-; CHECK-NEXT: ret double [[DOT012]]
+; SSE2-LABEL: @dot3f64_fast(
+; SSE2-NEXT: [[PTRX1:%.*]] = getelementptr inbounds double, ptr [[PTRX:%.*]], i64 1
+; SSE2-NEXT: [[PTRY1:%.*]] = getelementptr inbounds double, ptr [[PTRY:%.*]], i64 1
+; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[PTRX1]], align 4
+; SSE2-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[PTRY1]], align 4
+; SSE2-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[TMP2]]
+; SSE2-NEXT: [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> [[TMP3]])
+; SSE2-NEXT: ret double [[TMP4]]
+;
+; SSE4-LABEL: @dot3f64_fast(
+; SSE4-NEXT: [[PTRX1:%.*]] = getelementptr inbounds double, ptr [[PTRX:%.*]], i64 1
+; SSE4-NEXT: [[PTRY1:%.*]] = getelementptr inbounds double, ptr [[PTRY:%.*]], i64 1
+; SSE4-NEXT: [[X0:%.*]] = load double, ptr [[PTRX]], align 4
+; SSE4-NEXT: [[Y0:%.*]] = load double, ptr [[PTRY]], align 4
+; SSE4-NEXT: [[MUL0:%.*]] = fmul double [[X0]], [[Y0]]
+; SSE4-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[PTRX1]], align 4
+; SSE4-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[PTRY1]], align 4
+; SSE4-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[TMP2]]
+; SSE4-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
+; SSE4-NEXT: [[DOT01:%.*]] = fadd fast double [[MUL0]], [[TMP4]]
+; SSE4-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
+; SSE4-NEXT: [[DOT012:%.*]] = fadd fast double [[DOT01]], [[TMP5]]
+; SSE4-NEXT: ret double [[DOT012]]
+;
+; AVX-LABEL: @dot3f64_fast(
+; AVX-NEXT: [[PTRX1:%.*]] = getelementptr inbounds double, ptr [[PTRX:%.*]], i64 1
+; AVX-NEXT: [[PTRY1:%.*]] = getelementptr inbounds double, ptr [[PTRY:%.*]], i64 1
+; AVX-NEXT: [[X0:%.*]] = load double, ptr [[PTRX]], align 4
+; AVX-NEXT: [[Y0:%.*]] = load double, ptr [[PTRY]], align 4
+; AVX-NEXT: [[MUL0:%.*]] = fmul double [[X0]], [[Y0]]
+; AVX-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[PTRX1]], align 4
+; AVX-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[PTRY1]], align 4
+; AVX-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[TMP2]]
+; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
+; AVX-NEXT: [[DOT01:%.*]] = fadd fast double [[MUL0]], [[TMP4]]
+; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
+; AVX-NEXT: [[DOT012:%.*]] = fadd fast double [[DOT01]], [[TMP5]]
+; AVX-NEXT: ret double [[DOT012]]
 ;
 %ptrx1 = getelementptr inbounds double, ptr %ptrx, i64 1
 %ptry1 = getelementptr inbounds double, ptr %ptry, i64 1
@@ -265,20 +289,44 @@ define double @dot3f64_fast(ptr dereferenceable(32) %ptrx, ptr dereferenceable(3
 }
 
 define float @dot3f32_fast(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %ptry) {
-; CHECK-LABEL: @dot3f32_fast(
-; CHECK-NEXT: [[PTRX1:%.*]] = getelementptr inbounds float, ptr [[PTRX:%.*]], i64 1
-; CHECK-NEXT: [[PTRY1:%.*]] = getelementptr inbounds float, ptr [[PTRY:%.*]], i64 1
-; CHECK-NEXT: [[X0:%.*]] = load float, ptr [[PTRX]], align 4
-; CHECK-NEXT: [[Y0:%.*]] = load float, ptr [[PTRY]], align 4
-; CHECK-NEXT: [[MUL0:%.*]] = fmul float [[X0]], [[Y0]]
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[PTRX1]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[PTRY1]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
-; CHECK-NEXT: [[DOT01:%.*]] = fadd fast float [[MUL0]], [[TMP4]]
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
-; CHECK-NEXT: [[DOT012:%.*]] = fadd fast float [[DOT01]], [[TMP5]]
-; CHECK-NEXT: ret float [[DOT012]]
+; SSE2-LABEL: @dot3f32_fast(
+; SSE2-NEXT: [[PTRX1:%.*]] = getelementptr inbounds float, ptr [[PTRX:%.*]], i64 1
+; SSE2-NEXT: [[PTRY1:%.*]] = getelementptr inbounds float, ptr [[PTRY:%.*]], i64 1
+; SSE2-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[PTRX1]], align 4
+; SSE2-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[PTRY1]], align 4
+; SSE2-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP1]], [[TMP2]]
+; SSE2-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP3]])
+; SSE2-NEXT: ret float [[TMP4]]
+;
+; SSE4-LABEL: @dot3f32_fast(
+; SSE4-NEXT: [[PTRX1:%.*]] = getelementptr inbounds float, ptr [[PTRX:%.*]], i64 1
+; SSE4-NEXT: [[PTRY1:%.*]] = getelementptr inbounds float, ptr [[PTRY:%.*]], i64 1
+; SSE4-NEXT: [[X0:%.*]] = load float, ptr [[PTRX]], align 4
+; SSE4-NEXT: [[Y0:%.*]] = load float, ptr [[PTRY]], align 4
+; SSE4-NEXT: [[MUL0:%.*]] = fmul float [[X0]], [[Y0]]
+; SSE4-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[PTRX1]], align 4
+; SSE4-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[PTRY1]], align 4
+; SSE4-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP1]], [[TMP2]]
+; SSE4-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
+; SSE4-NEXT: [[DOT01:%.*]] = fadd fast float [[MUL0]], [[TMP4]]
+; SSE4-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
+; SSE4-NEXT: [[DOT012:%.*]] = fadd fast float [[DOT01]], [[TMP5]]
+; SSE4-NEXT: ret float [[DOT012]]
+;
+; AVX-LABEL: @dot3f32_fast(
+; AVX-NEXT: [[PTRX1:%.*]] = getelementptr inbounds float, ptr [[PTRX:%.*]], i64 1
+; AVX-NEXT: [[PTRY1:%.*]] = getelementptr inbounds float, ptr [[PTRY:%.*]], i64 1
+; AVX-NEXT: [[X0:%.*]] = load float, ptr [[PTRX]], align 4
+; AVX-NEXT: [[Y0:%.*]] = load float, ptr [[PTRY]], align 4
+; AVX-NEXT: [[MUL0:%.*]] = fmul float [[X0]], [[Y0]]
+; AVX-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[PTRX1]], align 4
+; AVX-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[PTRY1]], align 4
+; AVX-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP1]], [[TMP2]]
+; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
+; AVX-NEXT: [[DOT01:%.*]] = fadd fast float [[MUL0]], [[TMP4]]
+; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
+; AVX-NEXT: [[DOT012:%.*]] = fadd fast float [[DOT01]], [[TMP5]]
+; AVX-NEXT: ret float [[DOT012]]
 ;
 %ptrx1 = getelementptr inbounds float, ptr %ptrx, i64 1
 %ptry1 = getelementptr inbounds float, ptr %ptry, i64 1
@@ -347,14 +395,30 @@ define float @dot2f32(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %pt
 }
 
 define double @dot2f64_fast(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %ptry) {
-; CHECK-LABEL: @dot2f64_fast(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[PTRX:%.*]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[PTRY:%.*]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
-; CHECK-NEXT: [[DOT01:%.*]] = fadd fast double [[TMP4]], [[TMP5]]
-; CHECK-NEXT: ret double [[DOT01]]
+; SSE2-LABEL: @dot2f64_fast(
+; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[PTRX:%.*]], align 4
+; SSE2-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[PTRY:%.*]], align 4
+; SSE2-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[TMP2]]
+; SSE2-NEXT: [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> [[TMP3]])
+; SSE2-NEXT: ret double [[TMP4]]
+;
+; SSE4-LABEL: @dot2f64_fast(
+; SSE4-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[PTRX:%.*]], align 4
+; SSE4-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[PTRY:%.*]], align 4
+; SSE4-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[TMP2]]
+; SSE4-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
+; SSE4-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
+; SSE4-NEXT: [[DOT01:%.*]] = fadd fast double [[TMP4]], [[TMP5]]
+; SSE4-NEXT: ret double [[DOT01]]
+;
+; AVX-LABEL: @dot2f64_fast(
+; AVX-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[PTRX:%.*]], align 4
+; AVX-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[PTRY:%.*]], align 4
+; AVX-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[TMP2]]
+; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
+; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
+; AVX-NEXT: [[DOT01:%.*]] = fadd fast double [[TMP4]], [[TMP5]]
+; AVX-NEXT: ret double [[DOT01]]
 ;
 %ptrx1 = getelementptr inbounds double, ptr %ptrx, i64 1
 %ptry1 = getelementptr inbounds double, ptr %ptry, i64 1
@@ -369,14 +433,30 @@ define double @dot2f64_fast(ptr dereferenceable(16) %ptrx, ptr dereferenceable(1
 }
 
 define float @dot2f32_fast(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %ptry) {
-; CHECK-LABEL: @dot2f32_fast(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[PTRX:%.*]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[PTRY:%.*]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
-; CHECK-NEXT: [[DOT01:%.*]] = fadd fast float [[TMP4]], [[TMP5]]
-; CHECK-NEXT: ret float [[DOT01]]
+; SSE2-LABEL: @dot2f32_fast(
+; SSE2-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[PTRX:%.*]], align 4
+; SSE2-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[PTRY:%.*]], align 4
+; SSE2-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP1]], [[TMP2]]
+; SSE2-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP3]])
+; SSE2-NEXT: ret float [[TMP4]]
+;
+; SSE4-LABEL: @dot2f32_fast(
+; SSE4-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[PTRX:%.*]], align 4
+; SSE4-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[PTRY:%.*]], align 4
+; SSE4-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP1]], [[TMP2]]
+; SSE4-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
+; SSE4-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
+; SSE4-NEXT: [[DOT01:%.*]] = fadd fast float [[TMP4]], [[TMP5]]
+; SSE4-NEXT: ret float [[DOT01]]
+;
+; AVX-LABEL: @dot2f32_fast(
+; AVX-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[PTRX:%.*]], align 4
+; AVX-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[PTRY:%.*]], align 4
+; AVX-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP1]], [[TMP2]]
+; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
+; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
+; AVX-NEXT: [[DOT01:%.*]] = fadd fast float [[TMP4]], [[TMP5]]
+; AVX-NEXT: ret float [[DOT01]]
 ;
 %ptrx1 = getelementptr inbounds float, ptr %ptrx, i64 1
 %ptry1 = getelementptr inbounds float, ptr %ptry, i64 1
diff --git a/llvm/test/Transforms/SLPVectorizer/extracts-with-undefs.ll b/llvm/test/Transforms/SLPVectorizer/extracts-with-undefs.ll
index dca34b681032c..a64075db37ba1 100644
--- a/llvm/test/Transforms/SLPVectorizer/extracts-with-undefs.ll
+++ b/llvm/test/Transforms/SLPVectorizer/extracts-with-undefs.ll
@@ -9,9 +9,9 @@ define void @test() {
 ; CHECK: body:
 ; CHECK-NEXT: [[PHI1:%.*]] = phi double [ 0.000000e+00, [[ENTRY:%.*]] ], [ 0.000000e+00, [[BODY]] ]
 ; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x double> [ zeroinitializer, [[ENTRY]] ], [ zeroinitializer, [[BODY]] ]
-; CHECK-NEXT: [[MUL_I478_I:%.*]] = fmul fast double [[PHI1]], 0.000000e+00
-; CHECK-NEXT: [[MUL7_I485_I:%.*]] = fmul fast double undef, 0.000000e+00
-; CHECK-NEXT: [[ADD8_I_I:%.*]] = fadd fast double [[MUL_I478_I]], [[MUL7_I485_I]]
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> , double [[PHI1]], i32 0
+; CHECK-NEXT: [[TMP9:%.*]] = fmul fast <2 x double> , [[TMP8]]
+; CHECK-NEXT: [[ADD8_I_I:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> [[TMP9]])
 ; CHECK-NEXT: [[CMP42_I:%.*]] = fcmp fast ole double [[ADD8_I_I]], 0.000000e+00
 ; CHECK-NEXT: br i1 false, label [[BODY]], label [[EXIT:%.*]]
 ; CHECK: exit: