[SLP] Emit reduction instead of 2 extracts + scalar op, when vectorizing operands #147583

Merged
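
In short: when SLP vectorizes both operands of a scalar associative binary op, it can now match the pair as a two-element horizontal reduction and emit a single reduction intrinsic instead of two extractelements feeding the scalar op, provided TTI reports the reduction as cheaper. A minimal before/after sketch in LLVM IR, modeled on the updated tests below (the function name is illustrative):

```llvm
define float @two_elt_fadd(<2 x float> %v) {
  ; Before: both lanes are extracted and recombined with a scalar fadd.
  %e0 = extractelement <2 x float> %v, i64 0
  %e1 = extractelement <2 x float> %v, i64 1
  %sum = fadd fast float %e1, %e0
  ret float %sum
}
; After this patch, when the reduction cost beats two extracts plus the
; scalar op, SLP emits instead:
;   %sum = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> %v)
```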
82 changes: 77 additions & 5 deletions llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -21722,6 +21722,8 @@ class HorizontalReduction {
/// Checks if the optimization of original scalar identity operations on
/// matched horizontal reductions is enabled and allowed.
bool IsSupportedHorRdxIdentityOp = false;
/// The minimum number of the reduced values.
const unsigned ReductionLimit = VectorizeNonPowerOf2 ? 3 : 4;
/// Contains vector values for reduction including their scale factor and
/// signedness.
SmallVector<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales;
@@ -21740,7 +21742,8 @@ class HorizontalReduction {
}

/// Checks if instruction is associative and can be vectorized.
static bool isVectorizable(RecurKind Kind, Instruction *I) {
static bool isVectorizable(RecurKind Kind, Instruction *I,
bool TwoElementReduction = false) {
if (Kind == RecurKind::None)
return false;

@@ -21749,6 +21752,10 @@
isBoolLogicOp(I))
return true;

// No need to check for associativity if there are only 2 reduced values.
if (TwoElementReduction)
return true;

if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
// FP min/max are associative except for NaN and -0.0. We do not
// have to rule out -0.0 here because the intrinsic semantics do not
@@ -22020,6 +22027,27 @@

public:
HorizontalReduction() = default;
HorizontalReduction(Instruction *I, ArrayRef<Value *> Ops)
: ReductionRoot(I), ReductionLimit(2) {
RdxKind = HorizontalReduction::getRdxKind(I);
ReductionOps.emplace_back().push_back(I);
ReducedVals.emplace_back().assign(Ops.begin(), Ops.end());
for (Value *V : Ops)
ReducedValsToOps[V].push_back(I);
}

bool matchReductionForOperands() const {
// Analyze "regular" integer/FP types for reductions - no target-specific
// types or pointers.
assert(ReductionRoot && "Reduction root is not set!");
if (!isVectorizable(RdxKind, cast<Instruction>(ReductionRoot),
all_of(ReducedVals, [](ArrayRef<Value *> Ops) {
return Ops.size() == 2;
})))
return false;

return true;
}

/// Try to find a reduction tree.
bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
@@ -22187,7 +22215,6 @@
/// Attempt to vectorize the tree found by matchAssociativeReduction.
Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
const TargetLibraryInfo &TLI, AssumptionCache *AC) {
const unsigned ReductionLimit = VectorizeNonPowerOf2 ? 3 : 4;
constexpr unsigned RegMaxNumber = 4;
constexpr unsigned RedValsMaxNumber = 128;
// If there are a sufficient number of reduction values, reduce
@@ -23736,15 +23763,60 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
Candidates.emplace_back(A1, B);
}

auto TryToReduce = [this, &R, &TTI = *TTI](Instruction *Inst,
ArrayRef<Value *> Ops) {
if (!isReductionCandidate(Inst))
return false;
Type *Ty = Inst->getType();
if (!isValidElementType(Ty) || Ty->isPointerTy())
return false;
HorizontalReduction HorRdx(Inst, Ops);
if (!HorRdx.matchReductionForOperands())
return false;
// Check the cost of operations.
VectorType *VecTy = getWidenedType(Ty, Ops.size());
constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
InstructionCost ScalarCost =
TTI.getScalarizationOverhead(
VecTy, APInt::getAllOnes(getNumElements(VecTy)), /*Insert=*/false,
/*Extract=*/true, CostKind) +
TTI.getInstructionCost(Inst, CostKind);
InstructionCost RedCost;
switch (::getRdxKind(Inst)) {
case RecurKind::Add:
case RecurKind::Mul:
case RecurKind::Or:
case RecurKind::And:
case RecurKind::Xor:
case RecurKind::FAdd:
case RecurKind::FMul: {
Collaborator:
Can't the integer min/max kinds be included here?

Member (Author):
We do not vectorize them here for now, just binary operations and compares.

FastMathFlags FMF;
if (auto *FPCI = dyn_cast<FPMathOperator>(Inst))
FMF = FPCI->getFastMathFlags();
RedCost = TTI.getArithmeticReductionCost(Inst->getOpcode(), VecTy, FMF,
CostKind);
break;
}
default:
return false;
}
if (RedCost >= ScalarCost)
return false;

return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC) != nullptr;
};
if (Candidates.size() == 1)
return tryToVectorizeList({Op0, Op1}, R);
return TryToReduce(I, {Op0, Op1}) || tryToVectorizeList({Op0, Op1}, R);

// We have multiple options. Try to pick the single best.
std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
if (!BestCandidate)
return false;
return tryToVectorizeList(
{Candidates[*BestCandidate].first, Candidates[*BestCandidate].second}, R);
return TryToReduce(I, {Candidates[*BestCandidate].first,
Candidates[*BestCandidate].second}) ||
tryToVectorizeList({Candidates[*BestCandidate].first,
Candidates[*BestCandidate].second},
R);
}

bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
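For context: the new TryToReduce path above is cost-gated. The reduction cost from getArithmeticReductionCost must be strictly below the scalar cost (getScalarizationOverhead for the two extracts plus the scalar instruction's cost), and only the binary reduction kinds listed in the switch qualify. A sketch of the integer case, assuming a target where the reduction is cheaper (the function name is illustrative):

```llvm
define i32 @two_elt_add(<2 x i32> %v) {
  ; RecurKind::Add is handled by the switch, so the cost of a v2i32 add
  ; reduction is weighed against two extracts plus a scalar add.
  %e0 = extractelement <2 x i32> %v, i64 0
  %e1 = extractelement <2 x i32> %v, i64 1
  %sum = add i32 %e0, %e1
  ret i32 %sum
}
; If RedCost < ScalarCost, this becomes:
;   %sum = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %v)
; otherwise SLP falls back to the existing tryToVectorizeList path.
```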
8 changes: 2 additions & 6 deletions llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll
@@ -16,9 +16,7 @@ define void @test1(ptr nocapture readonly %J, i32 %xmin, i32 %ymin) {
; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[J:%.*]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = fsub fast <2 x float> [[TMP2]], [[TMP4]]
; CHECK-NEXT: [[TMP6:%.*]] = fmul fast <2 x float> [[TMP5]], [[TMP5]]
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP6]], i32 0
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP6]], i32 1
; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP7]], [[TMP8]]
; CHECK-NEXT: [[ADD:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP6]])
; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[ADD]], 0.000000e+00
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY3_LR_PH]], label [[FOR_END27:%.*]]
; CHECK: for.end27:
@@ -57,9 +55,7 @@ define void @test2(ptr nocapture readonly %J, i32 %xmin, i32 %ymin) {
; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[J:%.*]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = fsub fast <2 x float> [[TMP2]], [[TMP4]]
; CHECK-NEXT: [[TMP6:%.*]] = fmul fast <2 x float> [[TMP5]], [[TMP5]]
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP6]], i32 0
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP6]], i32 1
; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP8]], [[TMP7]]
; CHECK-NEXT: [[ADD:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP6]])
; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[ADD]], 0.000000e+00
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY3_LR_PH]], label [[FOR_END27:%.*]]
; CHECK: for.end27:
30 changes: 16 additions & 14 deletions llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll
@@ -3,13 +3,19 @@
; RUN: opt < %s -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK,FULLFP16

define half @reduce_fast_half2(<2 x half> %vec2) {
; CHECK-LABEL: define half @reduce_fast_half2(
; CHECK-SAME: <2 x half> [[VEC2:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[ELT0:%.*]] = extractelement <2 x half> [[VEC2]], i64 0
; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x half> [[VEC2]], i64 1
; CHECK-NEXT: [[ADD1:%.*]] = fadd fast half [[ELT1]], [[ELT0]]
; CHECK-NEXT: ret half [[ADD1]]
; NOFP16-LABEL: define half @reduce_fast_half2(
; NOFP16-SAME: <2 x half> [[VEC2:%.*]]) #[[ATTR0:[0-9]+]] {
; NOFP16-NEXT: [[ENTRY:.*:]]
; NOFP16-NEXT: [[ELT0:%.*]] = extractelement <2 x half> [[VEC2]], i64 0
; NOFP16-NEXT: [[ELT1:%.*]] = extractelement <2 x half> [[VEC2]], i64 1
; NOFP16-NEXT: [[ADD1:%.*]] = fadd fast half [[ELT1]], [[ELT0]]
; NOFP16-NEXT: ret half [[ADD1]]
;
; FULLFP16-LABEL: define half @reduce_fast_half2(
; FULLFP16-SAME: <2 x half> [[VEC2:%.*]]) #[[ATTR0:[0-9]+]] {
; FULLFP16-NEXT: [[ENTRY:.*:]]
; FULLFP16-NEXT: [[TMP0:%.*]] = call fast half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> [[VEC2]])
; FULLFP16-NEXT: ret half [[TMP0]]
;
entry:
%elt0 = extractelement <2 x half> %vec2, i64 0
@@ -20,7 +26,7 @@ entry:

define half @reduce_half2(<2 x half> %vec2) {
; CHECK-LABEL: define half @reduce_half2(
; CHECK-SAME: <2 x half> [[VEC2:%.*]]) #[[ATTR0]] {
; CHECK-SAME: <2 x half> [[VEC2:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[ELT0:%.*]] = extractelement <2 x half> [[VEC2]], i64 0
; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x half> [[VEC2]], i64 1
@@ -269,9 +275,7 @@ define float @reduce_fast_float2(<2 x float> %vec2) {
; CHECK-LABEL: define float @reduce_fast_float2(
; CHECK-SAME: <2 x float> [[VEC2:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[ELT0:%.*]] = extractelement <2 x float> [[VEC2]], i64 0
; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[VEC2]], i64 1
; CHECK-NEXT: [[ADD1:%.*]] = fadd fast float [[ELT1]], [[ELT0]]
; CHECK-NEXT: [[ADD1:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[VEC2]])
; CHECK-NEXT: ret float [[ADD1]]
;
entry:
@@ -409,9 +413,7 @@ define double @reduce_fast_double2(<2 x double> %vec2) {
; CHECK-LABEL: define double @reduce_fast_double2(
; CHECK-SAME: <2 x double> [[VEC2:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[ELT0:%.*]] = extractelement <2 x double> [[VEC2]], i64 0
; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x double> [[VEC2]], i64 1
; CHECK-NEXT: [[ADD1:%.*]] = fadd fast double [[ELT1]], [[ELT0]]
; CHECK-NEXT: [[ADD1:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> [[VEC2]])
; CHECK-NEXT: ret double [[ADD1]]
;
entry:
10 changes: 2 additions & 8 deletions llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll
@@ -216,22 +216,16 @@ define void @slp_profitable_missing_fmf_nnans_only(ptr %A, ptr %B) {
define float @slp_not_profitable_in_loop(float %x, ptr %A) {
; CHECK-LABEL: @slp_not_profitable_in_loop(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 2
; CHECK-NEXT: [[L_1:%.*]] = load float, ptr [[GEP_A_2]], align 4
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[A]], align 4
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[A:%.*]], align 4
; CHECK-NEXT: [[L_3:%.*]] = load float, ptr [[A]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> <float poison, float 3.000000e+00>, float [[X:%.*]], i32 0
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
; CHECK-NEXT: [[RED:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[RED_NEXT:%.*]], [[LOOP]] ]
; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP0]]
; CHECK-NEXT: [[MUL12:%.*]] = fmul fast float 3.000000e+00, [[L_1]]
; CHECK-NEXT: [[MUL16:%.*]] = fmul fast float 3.000000e+00, [[L_3]]
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[MUL12]], [[TMP3]]
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
; CHECK-NEXT: [[ADD13:%.*]] = fadd fast float [[ADD]], [[TMP4]]
; CHECK-NEXT: [[ADD13:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP2]])
; CHECK-NEXT: [[RED_NEXT]] = fadd fast float [[ADD13]], [[MUL16]]
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV]], 10
24 changes: 6 additions & 18 deletions llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll
@@ -141,33 +141,21 @@ define ptr @test4() {
; POWEROF2-NEXT: [[TMP1:%.*]] = fadd <8 x float> zeroinitializer, zeroinitializer
; POWEROF2-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
; POWEROF2-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 5, i32 6>
; POWEROF2-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 0, i32 4>
; POWEROF2-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> [[TMP2]], i64 0)
; POWEROF2-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP5]], <2 x float> [[TMP3]], i64 2)
; POWEROF2-NEXT: br label [[TMP8:%.*]]
; POWEROF2: 7:
; POWEROF2: 6:
; POWEROF2-NEXT: br label [[TMP8]]
; POWEROF2: 8:
; POWEROF2-NEXT: [[TMP9:%.*]] = phi <2 x float> [ poison, [[TMP7:%.*]] ], [ [[TMP4]], [[TMP0:%.*]] ]
; POWEROF2-NEXT: [[TMP10:%.*]] = phi <4 x float> [ poison, [[TMP7]] ], [ [[TMP6]], [[TMP0]] ]
; POWEROF2: 7:
; POWEROF2-NEXT: [[TMP10:%.*]] = phi <4 x float> [ poison, [[TMP7:%.*]] ], [ [[TMP6]], [[TMP0:%.*]] ]
; POWEROF2-NEXT: br label [[TMP11:%.*]]
; POWEROF2: 11:
; POWEROF2: 9:
; POWEROF2-NEXT: [[TMP12:%.*]] = call <2 x float> @llvm.vector.extract.v2f32.v4f32(<4 x float> [[TMP10]], i64 0)
; POWEROF2-NEXT: [[TMP13:%.*]] = fmul <2 x float> [[TMP12]], zeroinitializer
; POWEROF2-NEXT: [[TMP14:%.*]] = call <2 x float> @llvm.vector.extract.v2f32.v4f32(<4 x float> [[TMP10]], i64 2)
; POWEROF2-NEXT: [[TMP15:%.*]] = fmul <2 x float> zeroinitializer, [[TMP14]]
; POWEROF2-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP9]], i32 0
; POWEROF2-NEXT: [[TMP17:%.*]] = fmul float 0.000000e+00, [[TMP18]]
; POWEROF2-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[TMP9]], i32 1
; POWEROF2-NEXT: [[TMP19:%.*]] = fmul float [[TMP30]], 0.000000e+00
; POWEROF2-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[TMP13]], i32 0
; POWEROF2-NEXT: [[TMP21:%.*]] = fadd reassoc nsz float [[TMP20]], [[TMP17]]
; POWEROF2-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[TMP15]], i32 0
; POWEROF2-NEXT: [[TMP23:%.*]] = fadd reassoc nsz float [[TMP22]], [[TMP19]]
; POWEROF2-NEXT: [[TMP24:%.*]] = extractelement <2 x float> [[TMP13]], i32 1
; POWEROF2-NEXT: [[TMP25:%.*]] = fadd reassoc nsz float [[TMP21]], [[TMP24]]
; POWEROF2-NEXT: [[TMP26:%.*]] = extractelement <2 x float> [[TMP15]], i32 1
; POWEROF2-NEXT: [[TMP27:%.*]] = fadd reassoc nsz float [[TMP23]], [[TMP26]]
; POWEROF2-NEXT: [[TMP25:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP13]])
; POWEROF2-NEXT: [[TMP27:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP15]])
; POWEROF2-NEXT: [[TMP28:%.*]] = tail call float @llvm.sqrt.f32(float [[TMP25]])
; POWEROF2-NEXT: [[TMP29:%.*]] = tail call float @llvm.sqrt.f32(float [[TMP27]])
; POWEROF2-NEXT: ret ptr null