Skip to content

Commit 81ac2e2

Browse files
asb authored and github-actions[bot] committed
Automerge: Revert "[SLP] Emit reduction instead of 2 extracts + scalar op, when vectorizing operands (#147583)"
This reverts commit ac4a38e. The reverted change breaks the RVV builders (MicroBenchmarks/ImageProcessing/Blur/blur.test and MultiSource/Benchmarks/tramp3d-v4/tramp3d-v4.test from llvm-test-suite) and reportedly also SPEC Accel2023 <llvm/llvm-project#147583 (comment)>.
2 parents 719ebae + 18627e9 commit 81ac2e2

File tree

8 files changed

+129
-245
lines changed

8 files changed

+129
-245
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 5 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -21722,8 +21722,6 @@ class HorizontalReduction {
2172221722
/// Checks if the optimization of original scalar identity operations on
2172321723
/// matched horizontal reductions is enabled and allowed.
2172421724
bool IsSupportedHorRdxIdentityOp = false;
21725-
/// The minimum number of the reduced values.
21726-
const unsigned ReductionLimit = VectorizeNonPowerOf2 ? 3 : 4;
2172721725
/// Contains vector values for reduction including their scale factor and
2172821726
/// signedness.
2172921727
SmallVector<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales;
@@ -21742,8 +21740,7 @@ class HorizontalReduction {
2174221740
}
2174321741

2174421742
/// Checks if instruction is associative and can be vectorized.
21745-
static bool isVectorizable(RecurKind Kind, Instruction *I,
21746-
bool TwoElementReduction = false) {
21743+
static bool isVectorizable(RecurKind Kind, Instruction *I) {
2174721744
if (Kind == RecurKind::None)
2174821745
return false;
2174921746

@@ -21752,10 +21749,6 @@ class HorizontalReduction {
2175221749
isBoolLogicOp(I))
2175321750
return true;
2175421751

21755-
// No need to check for associativity, if 2 reduced values.
21756-
if (TwoElementReduction)
21757-
return true;
21758-
2175921752
if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
2176021753
// FP min/max are associative except for NaN and -0.0. We do not
2176121754
// have to rule out -0.0 here because the intrinsic semantics do not
@@ -22027,27 +22020,6 @@ class HorizontalReduction {
2202722020

2202822021
public:
2202922022
HorizontalReduction() = default;
22030-
HorizontalReduction(Instruction *I, ArrayRef<Value *> Ops)
22031-
: ReductionRoot(I), ReductionLimit(2) {
22032-
RdxKind = HorizontalReduction::getRdxKind(I);
22033-
ReductionOps.emplace_back().push_back(I);
22034-
ReducedVals.emplace_back().assign(Ops.begin(), Ops.end());
22035-
for (Value *V : Ops)
22036-
ReducedValsToOps[V].push_back(I);
22037-
}
22038-
22039-
bool matchReductionForOperands() const {
22040-
// Analyze "regular" integer/FP types for reductions - no target-specific
22041-
// types or pointers.
22042-
assert(ReductionRoot && "Reduction root is not set!");
22043-
if (!isVectorizable(RdxKind, cast<Instruction>(ReductionRoot),
22044-
all_of(ReducedVals, [](ArrayRef<Value *> Ops) {
22045-
return Ops.size() == 2;
22046-
})))
22047-
return false;
22048-
22049-
return true;
22050-
}
2205122023

2205222024
/// Try to find a reduction tree.
2205322025
bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
@@ -22215,6 +22187,7 @@ class HorizontalReduction {
2221522187
/// Attempt to vectorize the tree found by matchAssociativeReduction.
2221622188
Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
2221722189
const TargetLibraryInfo &TLI, AssumptionCache *AC) {
22190+
const unsigned ReductionLimit = VectorizeNonPowerOf2 ? 3 : 4;
2221822191
constexpr unsigned RegMaxNumber = 4;
2221922192
constexpr unsigned RedValsMaxNumber = 128;
2222022193
// If there are a sufficient number of reduction values, reduce
@@ -23763,60 +23736,15 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
2376323736
Candidates.emplace_back(A1, B);
2376423737
}
2376523738

23766-
auto TryToReduce = [this, &R, &TTI = *TTI](Instruction *Inst,
23767-
ArrayRef<Value *> Ops) {
23768-
if (!isReductionCandidate(Inst))
23769-
return false;
23770-
Type *Ty = Inst->getType();
23771-
if (!isValidElementType(Ty) || Ty->isPointerTy())
23772-
return false;
23773-
HorizontalReduction HorRdx(Inst, Ops);
23774-
if (!HorRdx.matchReductionForOperands())
23775-
return false;
23776-
// Check the cost of operations.
23777-
VectorType *VecTy = getWidenedType(Ty, Ops.size());
23778-
constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
23779-
InstructionCost ScalarCost =
23780-
TTI.getScalarizationOverhead(
23781-
VecTy, APInt::getAllOnes(getNumElements(VecTy)), /*Insert=*/false,
23782-
/*Extract=*/true, CostKind) +
23783-
TTI.getInstructionCost(Inst, CostKind);
23784-
InstructionCost RedCost;
23785-
switch (::getRdxKind(Inst)) {
23786-
case RecurKind::Add:
23787-
case RecurKind::Mul:
23788-
case RecurKind::Or:
23789-
case RecurKind::And:
23790-
case RecurKind::Xor:
23791-
case RecurKind::FAdd:
23792-
case RecurKind::FMul: {
23793-
FastMathFlags FMF;
23794-
if (auto *FPCI = dyn_cast<FPMathOperator>(Inst))
23795-
FMF = FPCI->getFastMathFlags();
23796-
RedCost = TTI.getArithmeticReductionCost(Inst->getOpcode(), VecTy, FMF,
23797-
CostKind);
23798-
break;
23799-
}
23800-
default:
23801-
return false;
23802-
}
23803-
if (RedCost >= ScalarCost)
23804-
return false;
23805-
23806-
return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC) != nullptr;
23807-
};
2380823739
if (Candidates.size() == 1)
23809-
return TryToReduce(I, {Op0, Op1}) || tryToVectorizeList({Op0, Op1}, R);
23740+
return tryToVectorizeList({Op0, Op1}, R);
2381023741

2381123742
// We have multiple options. Try to pick the single best.
2381223743
std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
2381323744
if (!BestCandidate)
2381423745
return false;
23815-
return TryToReduce(I, {Candidates[*BestCandidate].first,
23816-
Candidates[*BestCandidate].second}) ||
23817-
tryToVectorizeList({Candidates[*BestCandidate].first,
23818-
Candidates[*BestCandidate].second},
23819-
R);
23746+
return tryToVectorizeList(
23747+
{Candidates[*BestCandidate].first, Candidates[*BestCandidate].second}, R);
2382023748
}
2382123749

2382223750
bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,

llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,9 @@ define void @test1(ptr nocapture readonly %J, i32 %xmin, i32 %ymin) {
1616
; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[J:%.*]], align 4
1717
; CHECK-NEXT: [[TMP5:%.*]] = fsub fast <2 x float> [[TMP2]], [[TMP4]]
1818
; CHECK-NEXT: [[TMP6:%.*]] = fmul fast <2 x float> [[TMP5]], [[TMP5]]
19-
; CHECK-NEXT: [[ADD:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP6]])
19+
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP6]], i32 0
20+
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP6]], i32 1
21+
; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP7]], [[TMP8]]
2022
; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[ADD]], 0.000000e+00
2123
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY3_LR_PH]], label [[FOR_END27:%.*]]
2224
; CHECK: for.end27:
@@ -55,7 +57,9 @@ define void @test2(ptr nocapture readonly %J, i32 %xmin, i32 %ymin) {
5557
; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[J:%.*]], align 4
5658
; CHECK-NEXT: [[TMP5:%.*]] = fsub fast <2 x float> [[TMP2]], [[TMP4]]
5759
; CHECK-NEXT: [[TMP6:%.*]] = fmul fast <2 x float> [[TMP5]], [[TMP5]]
58-
; CHECK-NEXT: [[ADD:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP6]])
60+
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP6]], i32 0
61+
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP6]], i32 1
62+
; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP8]], [[TMP7]]
5963
; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[ADD]], 0.000000e+00
6064
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY3_LR_PH]], label [[FOR_END27:%.*]]
6165
; CHECK: for.end27:

llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll

Lines changed: 14 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -3,19 +3,13 @@
33
; RUN: opt < %s -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK,FULLFP16
44

55
define half @reduce_fast_half2(<2 x half> %vec2) {
6-
; NOFP16-LABEL: define half @reduce_fast_half2(
7-
; NOFP16-SAME: <2 x half> [[VEC2:%.*]]) #[[ATTR0:[0-9]+]] {
8-
; NOFP16-NEXT: [[ENTRY:.*:]]
9-
; NOFP16-NEXT: [[ELT0:%.*]] = extractelement <2 x half> [[VEC2]], i64 0
10-
; NOFP16-NEXT: [[ELT1:%.*]] = extractelement <2 x half> [[VEC2]], i64 1
11-
; NOFP16-NEXT: [[ADD1:%.*]] = fadd fast half [[ELT1]], [[ELT0]]
12-
; NOFP16-NEXT: ret half [[ADD1]]
13-
;
14-
; FULLFP16-LABEL: define half @reduce_fast_half2(
15-
; FULLFP16-SAME: <2 x half> [[VEC2:%.*]]) #[[ATTR0:[0-9]+]] {
16-
; FULLFP16-NEXT: [[ENTRY:.*:]]
17-
; FULLFP16-NEXT: [[TMP0:%.*]] = call fast half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> [[VEC2]])
18-
; FULLFP16-NEXT: ret half [[TMP0]]
6+
; CHECK-LABEL: define half @reduce_fast_half2(
7+
; CHECK-SAME: <2 x half> [[VEC2:%.*]]) #[[ATTR0:[0-9]+]] {
8+
; CHECK-NEXT: [[ENTRY:.*:]]
9+
; CHECK-NEXT: [[ELT0:%.*]] = extractelement <2 x half> [[VEC2]], i64 0
10+
; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x half> [[VEC2]], i64 1
11+
; CHECK-NEXT: [[ADD1:%.*]] = fadd fast half [[ELT1]], [[ELT0]]
12+
; CHECK-NEXT: ret half [[ADD1]]
1913
;
2014
entry:
2115
%elt0 = extractelement <2 x half> %vec2, i64 0
@@ -26,7 +20,7 @@ entry:
2620

2721
define half @reduce_half2(<2 x half> %vec2) {
2822
; CHECK-LABEL: define half @reduce_half2(
29-
; CHECK-SAME: <2 x half> [[VEC2:%.*]]) #[[ATTR0:[0-9]+]] {
23+
; CHECK-SAME: <2 x half> [[VEC2:%.*]]) #[[ATTR0]] {
3024
; CHECK-NEXT: [[ENTRY:.*:]]
3125
; CHECK-NEXT: [[ELT0:%.*]] = extractelement <2 x half> [[VEC2]], i64 0
3226
; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x half> [[VEC2]], i64 1
@@ -275,7 +269,9 @@ define float @reduce_fast_float2(<2 x float> %vec2) {
275269
; CHECK-LABEL: define float @reduce_fast_float2(
276270
; CHECK-SAME: <2 x float> [[VEC2:%.*]]) #[[ATTR0]] {
277271
; CHECK-NEXT: [[ENTRY:.*:]]
278-
; CHECK-NEXT: [[ADD1:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[VEC2]])
272+
; CHECK-NEXT: [[ELT0:%.*]] = extractelement <2 x float> [[VEC2]], i64 0
273+
; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[VEC2]], i64 1
274+
; CHECK-NEXT: [[ADD1:%.*]] = fadd fast float [[ELT1]], [[ELT0]]
279275
; CHECK-NEXT: ret float [[ADD1]]
280276
;
281277
entry:
@@ -413,7 +409,9 @@ define double @reduce_fast_double2(<2 x double> %vec2) {
413409
; CHECK-LABEL: define double @reduce_fast_double2(
414410
; CHECK-SAME: <2 x double> [[VEC2:%.*]]) #[[ATTR0]] {
415411
; CHECK-NEXT: [[ENTRY:.*:]]
416-
; CHECK-NEXT: [[ADD1:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> [[VEC2]])
412+
; CHECK-NEXT: [[ELT0:%.*]] = extractelement <2 x double> [[VEC2]], i64 0
413+
; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x double> [[VEC2]], i64 1
414+
; CHECK-NEXT: [[ADD1:%.*]] = fadd fast double [[ELT1]], [[ELT0]]
417415
; CHECK-NEXT: ret double [[ADD1]]
418416
;
419417
entry:

llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -216,16 +216,22 @@ define void @slp_profitable_missing_fmf_nnans_only(ptr %A, ptr %B) {
216216
define float @slp_not_profitable_in_loop(float %x, ptr %A) {
217217
; CHECK-LABEL: @slp_not_profitable_in_loop(
218218
; CHECK-NEXT: entry:
219-
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[A:%.*]], align 4
219+
; CHECK-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 2
220+
; CHECK-NEXT: [[L_1:%.*]] = load float, ptr [[GEP_A_2]], align 4
221+
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[A]], align 4
220222
; CHECK-NEXT: [[L_3:%.*]] = load float, ptr [[A]], align 4
221223
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> <float poison, float 3.000000e+00>, float [[X:%.*]], i32 0
222224
; CHECK-NEXT: br label [[LOOP:%.*]]
223225
; CHECK: loop:
224226
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
225227
; CHECK-NEXT: [[RED:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[RED_NEXT:%.*]], [[LOOP]] ]
226228
; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP0]]
229+
; CHECK-NEXT: [[MUL12:%.*]] = fmul fast float 3.000000e+00, [[L_1]]
227230
; CHECK-NEXT: [[MUL16:%.*]] = fmul fast float 3.000000e+00, [[L_3]]
228-
; CHECK-NEXT: [[ADD13:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP2]])
231+
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
232+
; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[MUL12]], [[TMP3]]
233+
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
234+
; CHECK-NEXT: [[ADD13:%.*]] = fadd fast float [[ADD]], [[TMP4]]
229235
; CHECK-NEXT: [[RED_NEXT]] = fadd fast float [[ADD13]], [[MUL16]]
230236
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
231237
; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV]], 10

llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -141,21 +141,33 @@ define ptr @test4() {
141141
; POWEROF2-NEXT: [[TMP1:%.*]] = fadd <8 x float> zeroinitializer, zeroinitializer
142142
; POWEROF2-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
143143
; POWEROF2-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 5, i32 6>
144+
; POWEROF2-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 0, i32 4>
144145
; POWEROF2-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> [[TMP2]], i64 0)
145146
; POWEROF2-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP5]], <2 x float> [[TMP3]], i64 2)
146147
; POWEROF2-NEXT: br label [[TMP8:%.*]]
147-
; POWEROF2: 6:
148-
; POWEROF2-NEXT: br label [[TMP8]]
149148
; POWEROF2: 7:
150-
; POWEROF2-NEXT: [[TMP10:%.*]] = phi <4 x float> [ poison, [[TMP7:%.*]] ], [ [[TMP6]], [[TMP0:%.*]] ]
149+
; POWEROF2-NEXT: br label [[TMP8]]
150+
; POWEROF2: 8:
151+
; POWEROF2-NEXT: [[TMP9:%.*]] = phi <2 x float> [ poison, [[TMP7:%.*]] ], [ [[TMP4]], [[TMP0:%.*]] ]
152+
; POWEROF2-NEXT: [[TMP10:%.*]] = phi <4 x float> [ poison, [[TMP7]] ], [ [[TMP6]], [[TMP0]] ]
151153
; POWEROF2-NEXT: br label [[TMP11:%.*]]
152-
; POWEROF2: 9:
154+
; POWEROF2: 11:
153155
; POWEROF2-NEXT: [[TMP12:%.*]] = call <2 x float> @llvm.vector.extract.v2f32.v4f32(<4 x float> [[TMP10]], i64 0)
154156
; POWEROF2-NEXT: [[TMP13:%.*]] = fmul <2 x float> [[TMP12]], zeroinitializer
155157
; POWEROF2-NEXT: [[TMP14:%.*]] = call <2 x float> @llvm.vector.extract.v2f32.v4f32(<4 x float> [[TMP10]], i64 2)
156158
; POWEROF2-NEXT: [[TMP15:%.*]] = fmul <2 x float> zeroinitializer, [[TMP14]]
157-
; POWEROF2-NEXT: [[TMP25:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP13]])
158-
; POWEROF2-NEXT: [[TMP27:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP15]])
159+
; POWEROF2-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP9]], i32 0
160+
; POWEROF2-NEXT: [[TMP17:%.*]] = fmul float 0.000000e+00, [[TMP18]]
161+
; POWEROF2-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[TMP9]], i32 1
162+
; POWEROF2-NEXT: [[TMP19:%.*]] = fmul float [[TMP30]], 0.000000e+00
163+
; POWEROF2-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[TMP13]], i32 0
164+
; POWEROF2-NEXT: [[TMP21:%.*]] = fadd reassoc nsz float [[TMP20]], [[TMP17]]
165+
; POWEROF2-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[TMP15]], i32 0
166+
; POWEROF2-NEXT: [[TMP23:%.*]] = fadd reassoc nsz float [[TMP22]], [[TMP19]]
167+
; POWEROF2-NEXT: [[TMP24:%.*]] = extractelement <2 x float> [[TMP13]], i32 1
168+
; POWEROF2-NEXT: [[TMP25:%.*]] = fadd reassoc nsz float [[TMP21]], [[TMP24]]
169+
; POWEROF2-NEXT: [[TMP26:%.*]] = extractelement <2 x float> [[TMP15]], i32 1
170+
; POWEROF2-NEXT: [[TMP27:%.*]] = fadd reassoc nsz float [[TMP23]], [[TMP26]]
159171
; POWEROF2-NEXT: [[TMP28:%.*]] = tail call float @llvm.sqrt.f32(float [[TMP25]])
160172
; POWEROF2-NEXT: [[TMP29:%.*]] = tail call float @llvm.sqrt.f32(float [[TMP27]])
161173
; POWEROF2-NEXT: ret ptr null

0 commit comments

Comments
 (0)