From 366897014e346a743fbcaebbff20c502e9796471 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs
Date: Wed, 9 Jul 2025 11:43:12 +0100
Subject: [PATCH] [LV] Remove common extends and selects in CSE

This PR extends the LoopVectorizer's common sub-expression elimination
(CSE) so that it also removes identical extend and select instructions.
The work originally covered only extends, but that exposed duplicate
selects in reduction-inloop.ll, so selects are handled as well.
---
 llvm/lib/IR/Instruction.cpp                   |   2 +
 .../Transforms/Utils/FunctionComparator.cpp   |   4 +
 .../Transforms/Vectorize/LoopVectorize.cpp    |   3 +-
 .../AArch64/induction-costs-sve.ll            |  19 +-
 ...outer_loop_test1_no_explicit_vect_width.ll |   3 +-
 .../LoopVectorize/ARM/mve-reductions.ll       |   3 +-
 .../LoopVectorize/PowerPC/vectorize-bswap.ll  |   5 +-
 .../RISCV/evl-compatible-loops.ll             |   3 +-
 .../RISCV/riscv-vector-reverse-output.ll      |  10 +-
 ...ize-force-tail-with-evl-cast-intrinsics.ll |   3 +-
 ...rize-force-tail-with-evl-cond-reduction.ll |   3 +-
 ...rize-force-tail-with-evl-gather-scatter.ll |   3 +-
 ...rize-force-tail-with-evl-reduction-cost.ll |   1 +
 ...-force-tail-with-evl-reverse-load-store.ll |  29 +-
 ...orize-force-tail-with-evl-uniform-store.ll |   3 +-
 ...outer_loop_test1_no_explicit_vect_width.ll |   6 +-
 llvm/test/Transforms/LoopVectorize/cse.ll     | 321 ++++++++++++++++++
 .../LoopVectorize/first-order-recurrence.ll   |  14 +-
 .../LoopVectorize/float-induction.ll          |   3 +-
 .../LoopVectorize/outer_loop_test1.ll         |   4 +-
 .../LoopVectorize/reduction-inloop.ll         |   3 +-
 21 files changed, 371 insertions(+), 74 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/cse.ll

diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp
index cbf39b8adf1b2..c5ce547053052 100644
--- a/llvm/lib/IR/Instruction.cpp
+++ b/llvm/lib/IR/Instruction.cpp
@@ -920,6 +920,8 @@ bool Instruction::hasSameSpecialState(const Instruction *I2,
   if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I1))
     return GEP->getSourceElementType() ==
            cast<GetElementPtrInst>(I2)->getSourceElementType();
+  if (const CastInst *Cast = dyn_cast<CastInst>(I1))
+    return Cast->getDestTy() == cast<CastInst>(I2)->getDestTy();
 
   return true;
 }
diff --git a/llvm/lib/Transforms/Utils/FunctionComparator.cpp b/llvm/lib/Transforms/Utils/FunctionComparator.cpp
index 6d4026e8209de..8d1f7c9b3a2ea 100644
--- a/llvm/lib/Transforms/Utils/FunctionComparator.cpp
+++ b/llvm/lib/Transforms/Utils/FunctionComparator.cpp
@@ -804,6 +804,10 @@ int FunctionComparator::cmpOperations(const Instruction *L,
       return Res;
     }
   }
+  if (const CastInst *Cast = dyn_cast<CastInst>(L)) {
+    const CastInst *CastR = cast<CastInst>(R);
+    return cmpTypes(Cast->getDestTy(), CastR->getDestTy());
+  }
 
   return 0;
 }
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 907839711a39c..d727f95844c04 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2643,7 +2643,8 @@ namespace {
 struct CSEDenseMapInfo {
   static bool canHandle(const Instruction *I) {
     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
-           isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
+           isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I) ||
+           isa<CastInst>(I) || isa<SelectInst>(I);
   }
 
   static inline Instruction *getEmptyKey() {
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
index dab14280a6b71..dad54b037a960 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
@@ -47,10 +47,8 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
 ; DEFAULT-NEXT: [[TMP27:%.*]] = zext
[[WIDE_LOAD4]] to ; DEFAULT-NEXT: [[TMP28:%.*]] = mul [[TMP26]], [[TMP13]] ; DEFAULT-NEXT: [[TMP29:%.*]] = mul [[TMP27]], [[TMP13]] -; DEFAULT-NEXT: [[TMP30:%.*]] = zext [[WIDE_LOAD]] to -; DEFAULT-NEXT: [[TMP31:%.*]] = zext [[WIDE_LOAD4]] to -; DEFAULT-NEXT: [[TMP32:%.*]] = or [[TMP28]], [[TMP30]] -; DEFAULT-NEXT: [[TMP33:%.*]] = or [[TMP29]], [[TMP31]] +; DEFAULT-NEXT: [[TMP32:%.*]] = or [[TMP28]], [[TMP26]] +; DEFAULT-NEXT: [[TMP33:%.*]] = or [[TMP29]], [[TMP27]] ; DEFAULT-NEXT: [[TMP34:%.*]] = lshr [[TMP32]], splat (i16 1) ; DEFAULT-NEXT: [[TMP35:%.*]] = lshr [[TMP33]], splat (i16 1) ; DEFAULT-NEXT: [[TMP36:%.*]] = trunc [[TMP34]] to @@ -129,8 +127,7 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 { ; PRED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP19]], i32 1, [[ACTIVE_LANE_MASK]], poison) ; PRED-NEXT: [[TMP17:%.*]] = zext [[WIDE_MASKED_LOAD]] to ; PRED-NEXT: [[TMP22:%.*]] = mul [[TMP17]], [[TMP16]] -; PRED-NEXT: [[TMP24:%.*]] = zext [[WIDE_MASKED_LOAD]] to -; PRED-NEXT: [[TMP20:%.*]] = or [[TMP22]], [[TMP24]] +; PRED-NEXT: [[TMP20:%.*]] = or [[TMP22]], [[TMP17]] ; PRED-NEXT: [[TMP21:%.*]] = lshr [[TMP20]], splat (i16 1) ; PRED-NEXT: [[TMP23:%.*]] = trunc [[TMP21]] to ; PRED-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]] @@ -385,9 +382,8 @@ define void @trunc_ivs_and_store(i32 %x, ptr %dst, i64 %N) #0 { ; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; DEFAULT-NEXT: [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32 ; DEFAULT-NEXT: [[TMP15:%.*]] = add i32 [[OFFSET_IDX]], 1 -; DEFAULT-NEXT: [[TMP16:%.*]] = trunc i64 [[INDEX]] to i32 -; DEFAULT-NEXT: [[TMP18:%.*]] = add i32 [[TMP16]], 1 -; DEFAULT-NEXT: [[TMP19:%.*]] = mul i32 [[MUL]], [[TMP16]] +; DEFAULT-NEXT: [[TMP18:%.*]] = add i32 [[OFFSET_IDX]], 1 +; DEFAULT-NEXT: [[TMP19:%.*]] = mul i32 [[MUL]], [[OFFSET_IDX]] ; DEFAULT-NEXT: [[TMP20:%.*]] = mul i32 [[MUL]], [[TMP18]] ; DEFAULT-NEXT: [[TMP21:%.*]] = zext i32 [[TMP19]] to i64 ; DEFAULT-NEXT: [[TMP22:%.*]] = zext i32 [[TMP20]] to i64 @@ -580,9 +576,8 @@ define void @ivs_trunc_and_ext(i32 %x, ptr %dst, i64 %N) #0 { ; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; DEFAULT-NEXT: [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32 ; DEFAULT-NEXT: [[TMP14:%.*]] = add i32 [[OFFSET_IDX]], 1 -; DEFAULT-NEXT: [[TMP15:%.*]] = trunc i64 [[INDEX]] to i32 -; DEFAULT-NEXT: [[TMP17:%.*]] = add i32 [[TMP15]], 1 -; DEFAULT-NEXT: [[TMP18:%.*]] = mul i32 [[ADD]], [[TMP15]] +; DEFAULT-NEXT: [[TMP17:%.*]] = add i32 [[OFFSET_IDX]], 1 +; DEFAULT-NEXT: [[TMP18:%.*]] = mul i32 [[ADD]], [[OFFSET_IDX]] ; DEFAULT-NEXT: [[TMP19:%.*]] = mul i32 [[ADD]], [[TMP17]] ; DEFAULT-NEXT: [[TMP20:%.*]] = zext i32 [[TMP18]] to i64 ; DEFAULT-NEXT: [[TMP21:%.*]] = zext i32 [[TMP19]] to i64 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_test1_no_explicit_vect_width.ll b/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_test1_no_explicit_vect_width.ll index 29795bc9f2982..1733e8d12edd9 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_test1_no_explicit_vect_width.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_test1_no_explicit_vect_width.ll @@ -27,8 +27,7 @@ ; CHECK: %[[AAddr:.*]] = getelementptr inbounds [8 x i32], ptr @arr2, i64 0, <4 x i64> %[[VecInd]] ; CHECK: %[[VecIndTr:.*]] = trunc <4 x i64> %[[VecInd]] to <4 x i32> ; CHECK: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %[[VecIndTr]], 
<4 x ptr> %[[AAddr]], i32 4, <4 x i1> splat (i1 true)) -; CHECK: %[[VecIndTr2:.*]] = trunc <4 x i64> %[[VecInd]] to <4 x i32> -; CHECK: %[[StoreVal:.*]] = add nsw <4 x i32> %[[VecIndTr2]], %[[Splat]] +; CHECK: %[[StoreVal:.*]] = add nsw <4 x i32> %[[VecIndTr]], %[[Splat]] ; CHECK: br label %[[InnerLoop:.+]] ; CHECK: [[InnerLoop]]: diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll index ddd334d2982f8..d047736b13993 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll @@ -1679,8 +1679,7 @@ define i64 @test_std_q31(ptr %x, i32 %n) #0 { ; CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i32> [[TMP1]] to <4 x i64> ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP2]]) ; CHECK-NEXT: [[TMP4]] = add i64 [[TMP3]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP5:%.*]] = sext <4 x i32> [[TMP1]] to <4 x i64> -; CHECK-NEXT: [[TMP6:%.*]] = mul nsw <4 x i64> [[TMP5]], [[TMP5]] +; CHECK-NEXT: [[TMP6:%.*]] = mul nsw <4 x i64> [[TMP2]], [[TMP2]] ; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP6]]) ; CHECK-NEXT: [[TMP8]] = add i64 [[TMP7]], [[VEC_PHI1]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-bswap.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-bswap.ll index a515b10bb7d62..70b2a71926090 100644 --- a/llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-bswap.ll +++ b/llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-bswap.ll @@ -20,10 +20,7 @@ define dso_local void @test(ptr %Arr, i32 signext %Len) { ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> [[WIDE_LOAD]]) -; CHECK-NEXT: [[TMP5:%.*]] = sext i32 [[INDEX]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0 -; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr [[TMP7]], align 4 +; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr [[TMP3]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/evl-compatible-loops.ll b/llvm/test/Transforms/LoopVectorize/RISCV/evl-compatible-loops.ll index 25f52b2a99ddc..ea4a436378a4a 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/evl-compatible-loops.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/evl-compatible-loops.ll @@ -35,8 +35,7 @@ define void @test_wide_integer_induction(ptr noalias %a, i64 %N) { ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[EVL_BASED_IV]] ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[TMP14]], i32 0 ; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0( [[VEC_IND]], ptr align 8 [[TMP15]], splat (i1 true), i32 [[TMP11]]) -; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP11]] to i64 -; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]] +; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP12]], [[EVL_BASED_IV]] ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], [[TMP8]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: 
[[TMP17:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll index 29b27cdb7556d..8d0ff03dad3bb 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll @@ -108,9 +108,8 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) { ; RV32-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[WIDE_LOAD]]) ; RV32-NEXT: [[TMP15:%.*]] = add [[REVERSE]], splat (i32 1) ; RV32-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]] -; RV32-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP5]] to i32 -; RV32-NEXT: [[TMP18:%.*]] = mul i32 0, [[TMP17]] -; RV32-NEXT: [[TMP25:%.*]] = sub i32 [[TMP17]], 1 +; RV32-NEXT: [[TMP18:%.*]] = mul i32 0, [[TMP10]] +; RV32-NEXT: [[TMP25:%.*]] = sub i32 [[TMP10]], 1 ; RV32-NEXT: [[TMP19:%.*]] = mul i32 -1, [[TMP25]] ; RV32-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 [[TMP18]] ; RV32-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i32 [[TMP19]] @@ -322,9 +321,8 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) { ; RV32-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4f32( [[WIDE_LOAD]]) ; RV32-NEXT: [[TMP15:%.*]] = fadd [[REVERSE]], splat (float 1.000000e+00) ; RV32-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]] -; RV32-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP5]] to i32 -; RV32-NEXT: [[TMP18:%.*]] = mul i32 0, [[TMP17]] -; RV32-NEXT: [[TMP25:%.*]] = sub i32 [[TMP17]], 1 +; RV32-NEXT: [[TMP18:%.*]] = mul i32 0, [[TMP10]] +; RV32-NEXT: [[TMP25:%.*]] = sub i32 [[TMP10]], 1 ; RV32-NEXT: [[TMP19:%.*]] = mul i32 -1, [[TMP25]] ; RV32-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i32 [[TMP18]] ; RV32-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 [[TMP19]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cast-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cast-intrinsics.ll index 091eb8720260b..2c35ed9a6f4ae 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cast-intrinsics.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cast-intrinsics.ll @@ -982,8 +982,7 @@ define void @vp_ptrtoint(ptr %a, ptr %b, i64 %N) { ; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[TMP16]], i32 0 ; IF-EVL-NEXT: call void @llvm.vp.store.nxv2i64.p0( [[TMP15]], ptr align 8 [[TMP17]], splat (i1 true), i32 [[TMP11]]) -; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP11]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP18]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP12]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] ; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] ; IF-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll index 8c44da63e08a6..55ac4cca308bd 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll +++ 
b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll @@ -827,8 +827,7 @@ define i32 @step_cond_add_pred(ptr %a, i64 %n, i32 %start) { ; IF-EVL-OUTLOOP-NEXT: [[TMP23:%.*]] = select [[TMP18]], [[TMP21]], zeroinitializer ; IF-EVL-OUTLOOP-NEXT: [[PREDPHI:%.*]] = select [[TMP23]], [[VEC_PHI]], [[TMP22]] ; IF-EVL-OUTLOOP-NEXT: [[TMP24]] = call @llvm.vp.merge.nxv4i32( splat (i1 true), [[PREDPHI]], [[VEC_PHI]], i32 [[TMP14]]) -; IF-EVL-OUTLOOP-NEXT: [[TMP25:%.*]] = zext i32 [[TMP14]] to i64 -; IF-EVL-OUTLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP25]], [[IV]] +; IF-EVL-OUTLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[IV]] ; IF-EVL-OUTLOOP-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] ; IF-EVL-OUTLOOP-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT6]] ; IF-EVL-OUTLOOP-NEXT: [[VEC_IND_NEXT7]] = add [[VEC_IND2]], [[BROADCAST_SPLAT4]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-gather-scatter.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-gather-scatter.ll index 2b7a9fbdfd801..b494d7138b95b 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-gather-scatter.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-gather-scatter.ll @@ -42,8 +42,7 @@ define void @gather_scatter(ptr noalias %in, ptr noalias %out, ptr noalias %inde ; IF-EVL-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call @llvm.vp.gather.nxv2f32.nxv2p0( align 4 [[TMP15]], splat (i1 true), i32 [[TMP11]]) ; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[OUT:%.*]], [[WIDE_MASKED_GATHER]] ; IF-EVL-NEXT: call void @llvm.vp.scatter.nxv2f32.nxv2p0( [[WIDE_MASKED_GATHER2]], align 4 [[TMP16]], splat (i1 true), i32 [[TMP11]]) -; IF-EVL-NEXT: [[TMP17:%.*]] = zext i32 [[TMP11]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP17]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP12]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], [[TMP8]] ; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] ; IF-EVL-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction-cost.ll index 15f99931d165f..f794bef3c794e 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction-cost.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; REQUIRES: asserts ; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize --disable-output \ ; RUN: -force-tail-folding-style=data-with-evl \ diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll index 91d94e52d0990..5b0738f268bf3 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll @@ -40,16 +40,14 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt ; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], splat (i1 true), i32 
[[TMP5]]) ; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[VP_OP_LOAD]], splat (i1 true), i32 [[TMP5]]) ; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP7]] -; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP5]] to i64 -; IF-EVL-NEXT: [[TMP14:%.*]] = mul i64 0, [[TMP19]] -; IF-EVL-NEXT: [[TMP23:%.*]] = sub i64 [[TMP19]], 1 +; IF-EVL-NEXT: [[TMP14:%.*]] = mul i64 0, [[TMP18]] +; IF-EVL-NEXT: [[TMP23:%.*]] = sub i64 [[TMP18]], 1 ; IF-EVL-NEXT: [[TMP15:%.*]] = mul i64 -1, [[TMP23]] ; IF-EVL-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[TMP13]], i64 [[TMP14]] ; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[TMP22]], i64 [[TMP15]] ; IF-EVL-NEXT: [[VP_REVERSE3:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[VP_REVERSE]], splat (i1 true), i32 [[TMP5]]) ; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_REVERSE3]], ptr align 4 [[TMP17]], splat (i1 true), i32 [[TMP5]]) -; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP5]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP20]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP18]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] ; IF-EVL-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -146,17 +144,15 @@ define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noal ; IF-EVL-NEXT: [[VP_OP_LOAD4:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP20]], [[VP_REVERSE_MASK]], i32 [[TMP5]]) ; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[VP_OP_LOAD4]], splat (i1 true), i32 [[TMP5]]) ; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[PTR2:%.*]], i64 [[TMP11]] -; IF-EVL-NEXT: [[TMP27:%.*]] = zext i32 [[TMP5]] to i64 -; IF-EVL-NEXT: [[TMP22:%.*]] = mul i64 0, [[TMP27]] -; IF-EVL-NEXT: [[TMP30:%.*]] = sub i64 [[TMP27]], 1 +; IF-EVL-NEXT: [[TMP22:%.*]] = mul i64 0, [[TMP26]] +; IF-EVL-NEXT: [[TMP30:%.*]] = sub i64 [[TMP26]], 1 ; IF-EVL-NEXT: [[TMP23:%.*]] = mul i64 -1, [[TMP30]] ; IF-EVL-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr [[TMP21]], i64 [[TMP22]] ; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[TMP24]], i64 [[TMP23]] ; IF-EVL-NEXT: [[VP_REVERSE5:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[VP_REVERSE]], splat (i1 true), i32 [[TMP5]]) ; IF-EVL-NEXT: [[VP_REVERSE_MASK6:%.*]] = call @llvm.experimental.vp.reverse.nxv4i1( [[TMP14]], splat (i1 true), i32 [[TMP5]]) ; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_REVERSE5]], ptr align 4 [[TMP25]], [[VP_REVERSE_MASK6]], i32 [[TMP5]]) -; IF-EVL-NEXT: [[TMP28:%.*]] = zext i32 [[TMP5]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP28]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP26]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] ; IF-EVL-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] @@ -274,25 +270,22 @@ define void @multiple_reverse_vector_pointer(ptr noalias %a, ptr noalias %b, ptr ; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[B:%.*]], [[VP_REVERSE]] ; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.vp.gather.nxv16i8.nxv16p0( align 1 [[TMP14]], splat (i1 true), i32 [[TMP6]]) ; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 
[[OFFSET_IDX]] -; IF-EVL-NEXT: [[TMP16:%.*]] = zext i32 [[TMP6]] to i64 -; IF-EVL-NEXT: [[TMP17:%.*]] = mul i64 0, [[TMP16]] -; IF-EVL-NEXT: [[TMP30:%.*]] = sub i64 [[TMP16]], 1 +; IF-EVL-NEXT: [[TMP17:%.*]] = mul i64 0, [[TMP9]] +; IF-EVL-NEXT: [[TMP30:%.*]] = sub i64 [[TMP9]], 1 ; IF-EVL-NEXT: [[TMP18:%.*]] = mul i64 -1, [[TMP30]] ; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[TMP15]], i64 [[TMP17]] ; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[TMP19]], i64 [[TMP18]] ; IF-EVL-NEXT: [[VP_REVERSE1:%.*]] = call @llvm.experimental.vp.reverse.nxv16i8( [[WIDE_MASKED_GATHER]], splat (i1 true), i32 [[TMP6]]) ; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0( [[VP_REVERSE1]], ptr align 1 [[TMP20]], splat (i1 true), i32 [[TMP6]]) ; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[D:%.*]], i64 [[OFFSET_IDX]] -; IF-EVL-NEXT: [[TMP22:%.*]] = zext i32 [[TMP6]] to i64 -; IF-EVL-NEXT: [[TMP23:%.*]] = mul i64 0, [[TMP22]] -; IF-EVL-NEXT: [[TMP31:%.*]] = sub i64 [[TMP22]], 1 +; IF-EVL-NEXT: [[TMP23:%.*]] = mul i64 0, [[TMP9]] +; IF-EVL-NEXT: [[TMP31:%.*]] = sub i64 [[TMP9]], 1 ; IF-EVL-NEXT: [[TMP24:%.*]] = mul i64 -1, [[TMP31]] ; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[TMP21]], i64 [[TMP23]] ; IF-EVL-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[TMP25]], i64 [[TMP24]] ; IF-EVL-NEXT: [[VP_REVERSE2:%.*]] = call @llvm.experimental.vp.reverse.nxv16i8( [[WIDE_MASKED_GATHER]], splat (i1 true), i32 [[TMP6]]) ; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0( [[VP_REVERSE2]], ptr align 1 [[TMP26]], splat (i1 true), i32 [[TMP6]]) -; IF-EVL-NEXT: [[TMP27:%.*]] = zext i32 [[TMP6]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP27]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP9]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] ; IF-EVL-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll index 984b64c55ce16..8469b22d7739b 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll @@ -40,8 +40,7 @@ define void @lshift_significand(i32 %n, ptr nocapture writeonly %dst) { ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i64, ptr [[TMP18]], i64 [[TMP17]] ; CHECK-NEXT: [[VP_REVERSE:%.*]] = call @llvm.experimental.vp.reverse.nxv2i64( zeroinitializer, splat (i1 true), i32 [[TMP11]]) ; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0( [[VP_REVERSE]], ptr align 8 [[TMP19]], splat (i1 true), i32 [[TMP11]]) -; CHECK-NEXT: [[TMP20:%.*]] = zext i32 [[TMP11]] to i64 -; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP20]], [[EVL_BASED_IV]] +; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP15]], [[EVL_BASED_IV]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]] ; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll b/llvm/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll index 02d48cbda1aab..c80962b3cc9cc 100644 --- 
a/llvm/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll @@ -28,8 +28,7 @@ ; CHECK: %[[AAddr:.*]] = getelementptr inbounds [8 x i32], ptr @arr2, i64 0, <4 x i64> %[[VecInd]] ; CHECK: %[[VecIndTr:.*]] = trunc <4 x i64> %[[VecInd]] to <4 x i32> ; CHECK: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %[[VecIndTr]], <4 x ptr> %[[AAddr]], i32 4, <4 x i1> splat (i1 true)) -; CHECK: %[[VecIndTr2:.*]] = trunc <4 x i64> %[[VecInd]] to <4 x i32> -; CHECK: %[[StoreVal:.*]] = add nsw <4 x i32> %[[VecIndTr2]], %[[Splat]] +; CHECK: %[[StoreVal:.*]] = add nsw <4 x i32> %[[VecIndTr]], %[[Splat]] ; CHECK: br label %[[InnerLoop:.+]] ; CHECK: [[InnerLoop]]: @@ -57,8 +56,7 @@ ; AVX: %[[AAddr:.*]] = getelementptr inbounds [8 x i32], ptr @arr2, i64 0, <8 x i64> %[[VecInd]] ; AVX: %[[VecIndTr:.*]] = trunc <8 x i64> %[[VecInd]] to <8 x i32> ; AVX: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %[[VecIndTr]], <8 x ptr> %[[AAddr]], i32 4, <8 x i1> splat (i1 true)) -; AVX: %[[VecIndTr2:.*]] = trunc <8 x i64> %[[VecInd]] to <8 x i32> -; AVX: %[[StoreVal:.*]] = add nsw <8 x i32> %[[VecIndTr2]], %[[Splat]] +; AVX: %[[StoreVal:.*]] = add nsw <8 x i32> %[[VecIndTr]], %[[Splat]] ; AVX: br label %[[InnerLoop:.+]] ; AVX: [[InnerLoop]]: diff --git a/llvm/test/Transforms/LoopVectorize/cse.ll b/llvm/test/Transforms/LoopVectorize/cse.ll new file mode 100644 index 0000000000000..ce5a4bb8b1e51 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/cse.ll @@ -0,0 +1,321 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "^scalar.ph:" --version 5 +; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -enable-epilogue-vectorization=false -S < %s | FileCheck %s + +define i32 @common_sext(ptr %a, ptr %b, ptr %c, i32 %N) { +; CHECK-LABEL: define i32 @common_sext( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP5]], align 1 +; CHECK-NEXT: [[TMP6:%.*]] = sext <4 x i8> [[WIDE_LOAD]] to <4 x 
i32> +; CHECK-NEXT: [[TMP7:%.*]] = sext <4 x i8> [[WIDE_LOAD1]] to <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = sext <4 x i8> [[WIDE_LOAD2]] to <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = add nsw <4 x i32> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = add <4 x i32> [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[TMP11]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP11]], i32 3 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + %cmp28.not = icmp ult i32 %N, 2 + %div27 = lshr i32 %N, 1 + %wide.trip.count = zext nneg i32 %div27 to i64 + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %res = phi i32 [ 0, %entry ], [ %add3, %for.body ] + %a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %iv + %b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %iv + %c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %iv + %a.val = load i8, ptr %a.ptr, align 1 + %b.val = load i8, ptr %b.ptr, align 1 + %c.val = load i8, ptr %c.ptr, align 1 + %a.ext = sext i8 %a.val to i32 + %b.ext = sext i8 %b.val to i32 + %b.ext2 = sext i8 %b.val to i32 + %c.ext = sext i8 %c.val to i32 + %add = add nsw i32 %a.ext, %b.ext + %add2 = add nsw i32 %c.ext, %b.ext2 + %add3 = add i32 %add, %add2 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, %wide.trip.count + br i1 %exitcond, label %exit, label %for.body + +exit: ; preds = %exit.loopexit, %entry + %res.0.lcssa = phi i32 [ %add3, %for.body ] + ret i32 %res.0.lcssa +} + +define i32 @common_zext(ptr %a, ptr %b, ptr %c, i32 %N) { +; CHECK-LABEL: define i32 @common_zext( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, 
ptr [[TMP11]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP12]], align 1 +; CHECK-NEXT: [[TMP4:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[WIDE_LOAD1]] to <4 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = zext <4 x i8> [[WIDE_LOAD2]] to <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[TMP13]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i32> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[TMP8]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP8]], i32 3 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + %cmp28.not = icmp ult i32 %N, 2 + %div27 = lshr i32 %N, 1 + %wide.trip.count = zext nneg i32 %div27 to i64 + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %res = phi i32 [ 0, %entry ], [ %add3, %for.body ] + %a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %iv + %b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %iv + %c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %iv + %a.val = load i8, ptr %a.ptr, align 1 + %b.val = load i8, ptr %b.ptr, align 1 + %c.val = load i8, ptr %c.ptr, align 1 + %a.ext = zext i8 %a.val to i32 + %b.ext = zext i8 %b.val to i32 + %b.ext2 = zext i8 %b.val to i32 + %c.ext = zext i8 %c.val to i32 + %add = add nsw i32 %a.ext, %b.ext + %add2 = add nsw i32 %c.ext, %b.ext2 + %add3 = add i32 %add, %add2 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, %wide.trip.count + br i1 %exitcond, label %exit, label %for.body + +exit: ; preds = %exit.loopexit, %entry + %res.0.lcssa = phi i32 [ %add3, %for.body ] + ret i32 %res.0.lcssa +} + +define i32 @common_sext_different_types(ptr %a, ptr %b, ptr %c, i32 %N) { +; CHECK-LABEL: define i32 @common_sext_different_types( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr 
[[B]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP10]], align 1 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP11]], align 1 +; CHECK-NEXT: [[TMP12:%.*]] = load i16, ptr [[TMP5]], align 1 +; CHECK-NEXT: [[TMP13:%.*]] = load i16, ptr [[TMP6]], align 1 +; CHECK-NEXT: [[TMP14:%.*]] = load i16, ptr [[TMP7]], align 1 +; CHECK-NEXT: [[TMP15:%.*]] = load i16, ptr [[TMP8]], align 1 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x i16> poison, i16 [[TMP12]], i32 0 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i16> [[TMP16]], i16 [[TMP13]], i32 1 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x i16> [[TMP17]], i16 [[TMP14]], i32 2 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x i16> [[TMP18]], i16 [[TMP15]], i32 3 +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP20]], align 1 +; CHECK-NEXT: [[TMP21:%.*]] = sext <4 x i8> [[WIDE_LOAD]] to <4 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = sext <4 x i8> [[WIDE_LOAD1]] to <4 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = sext <4 x i16> [[TMP19]] to <4 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = sext <4 x i8> [[WIDE_LOAD2]] to <4 x i32> +; CHECK-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[TMP21]], [[TMP22]] +; CHECK-NEXT: [[TMP26:%.*]] = add nsw <4 x i32> [[TMP24]], [[TMP23]] +; CHECK-NEXT: [[TMP27:%.*]] = add <4 x i32> [[TMP25]], [[TMP26]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x i32> [[TMP27]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP27]], i32 3 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + %cmp28.not = icmp ult i32 %N, 2 + %div27 = lshr i32 %N, 1 + %wide.trip.count = zext nneg i32 %div27 to i64 + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %res = phi i32 [ 0, %entry ], [ %add3, %for.body ] + %a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %iv + %b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %iv + %b.ptr2 = getelementptr inbounds nuw i16, ptr %b, i64 %iv + %c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %iv + %a.val = load i8, ptr %a.ptr, align 1 + %b.val = load i8, ptr %b.ptr, align 1 + %b.val2 = load i16, ptr %b.ptr, align 1 + %c.val = load i8, ptr %c.ptr, align 1 + %a.ext = sext i8 %a.val to i32 + %b.ext = sext i8 %b.val to i32 + %b.ext2 = sext i16 %b.val2 to i32 + %c.ext = sext i8 %c.val to i32 + %add = add nsw i32 %a.ext, %b.ext + %add2 = add nsw i32 %c.ext, %b.ext2 + %add3 = add i32 %add, %add2 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, %wide.trip.count + br i1 %exitcond, label %exit, label %for.body + +exit: ; preds = %exit.loopexit, %entry + 
%res.0.lcssa = phi i32 [ %add3, %for.body ] + ret i32 %res.0.lcssa +} +define i32 @common_zext_different_types(ptr %a, ptr %b, ptr %c, i32 %N) { +; CHECK-LABEL: define i32 @common_zext_different_types( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP10]], align 1 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP11]], align 1 +; CHECK-NEXT: [[TMP12:%.*]] = load i16, ptr [[TMP5]], align 1 +; CHECK-NEXT: [[TMP13:%.*]] = load i16, ptr [[TMP6]], align 1 +; CHECK-NEXT: [[TMP14:%.*]] = load i16, ptr [[TMP7]], align 1 +; CHECK-NEXT: [[TMP15:%.*]] = load i16, ptr [[TMP8]], align 1 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x i16> poison, i16 [[TMP12]], i32 0 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i16> [[TMP16]], i16 [[TMP13]], i32 1 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x i16> [[TMP17]], i16 [[TMP14]], i32 2 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x i16> [[TMP18]], i16 [[TMP15]], i32 3 +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP20]], align 1 +; CHECK-NEXT: [[TMP21:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = zext <4 x i8> [[WIDE_LOAD1]] to <4 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = zext <4 x i16> [[TMP19]] to <4 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = zext <4 x i8> [[WIDE_LOAD2]] to <4 x i32> +; CHECK-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[TMP21]], [[TMP22]] +; CHECK-NEXT: [[TMP26:%.*]] = add nsw <4 x i32> [[TMP24]], [[TMP23]] +; CHECK-NEXT: [[TMP27:%.*]] = add <4 x i32> [[TMP25]], [[TMP26]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: 
[[TMP29:%.*]] = extractelement <4 x i32> [[TMP27]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP27]], i32 3 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + %cmp28.not = icmp ult i32 %N, 2 + %div27 = lshr i32 %N, 1 + %wide.trip.count = zext nneg i32 %div27 to i64 + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %res = phi i32 [ 0, %entry ], [ %add3, %for.body ] + %a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %iv + %b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %iv + %b.ptr2 = getelementptr inbounds nuw i16, ptr %b, i64 %iv + %c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %iv + %a.val = load i8, ptr %a.ptr, align 1 + %b.val = load i8, ptr %b.ptr, align 1 + %b.val2 = load i16, ptr %b.ptr, align 1 + %c.val = load i8, ptr %c.ptr, align 1 + %a.ext = zext i8 %a.val to i32 + %b.ext = zext i8 %b.val to i32 + %b.ext2 = zext i16 %b.val2 to i32 + %c.ext = zext i8 %c.val to i32 + %add = add nsw i32 %a.ext, %b.ext + %add2 = add nsw i32 %c.ext, %b.ext2 + %add3 = add i32 %add, %add2 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, %wide.trip.count + br i1 %exitcond, label %exit, label %for.body + +exit: ; preds = %exit.loopexit, %entry + %res.0.lcssa = phi i32 [ %add3, %for.body ] + ret i32 %res.0.lcssa +} diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll index 17c2be64f1a31..280200e850e45 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll @@ -571,13 +571,12 @@ define void @recurrence_3(ptr readonly noalias %a, ptr noalias %b, i32 %n, float ; UNROLL-NO-VF-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP4]] ; UNROLL-NO-VF-NEXT: [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 2 ; UNROLL-NO-VF-NEXT: [[TMP8]] = load i16, ptr [[TMP6]], align 2 -; UNROLL-NO-VF-NEXT: [[TMP9:%.*]] = sitofp i16 [[TMP7]] to double +; UNROLL-NO-VF-NEXT: [[TMP12:%.*]] = sitofp i16 [[TMP7]] to double ; UNROLL-NO-VF-NEXT: [[TMP10:%.*]] = sitofp i16 [[TMP8]] to double ; UNROLL-NO-VF-NEXT: [[TMP11:%.*]] = sitofp i16 [[VECTOR_RECUR]] to double -; UNROLL-NO-VF-NEXT: [[TMP12:%.*]] = sitofp i16 [[TMP7]] to double ; UNROLL-NO-VF-NEXT: [[TMP13:%.*]] = fmul fast double [[TMP11]], [[CONV1]] ; UNROLL-NO-VF-NEXT: [[TMP14:%.*]] = fmul fast double [[TMP12]], [[CONV1]] -; UNROLL-NO-VF-NEXT: [[TMP15:%.*]] = fsub fast double [[TMP9]], [[TMP13]] +; UNROLL-NO-VF-NEXT: [[TMP15:%.*]] = fsub fast double [[TMP12]], [[TMP13]] ; UNROLL-NO-VF-NEXT: [[TMP16:%.*]] = fsub fast double [[TMP10]], [[TMP14]] ; UNROLL-NO-VF-NEXT: [[TMP17:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[OFFSET_IDX]] ; UNROLL-NO-VF-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[TMP4]] @@ -1768,11 +1767,10 @@ define void @sink_after(ptr noalias %a, ptr noalias %b, i64 %n) { ; UNROLL-NO-VF-NEXT: [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 2 ; UNROLL-NO-VF-NEXT: [[TMP7]] = load i16, ptr [[TMP5]], align 2 ; UNROLL-NO-VF-NEXT: [[TMP8:%.*]] = sext i16 [[VECTOR_RECUR]] to i32 -; UNROLL-NO-VF-NEXT: [[TMP9:%.*]] = sext i16 [[TMP6]] to i32 ; UNROLL-NO-VF-NEXT: [[TMP10:%.*]] = sext i16 [[TMP6]] to i32 ; UNROLL-NO-VF-NEXT: [[TMP11:%.*]] = sext i16 [[TMP7]] to i32 ; UNROLL-NO-VF-NEXT: 
[[TMP12:%.*]] = mul nsw i32 [[TMP10]], [[TMP8]] -; UNROLL-NO-VF-NEXT: [[TMP13:%.*]] = mul nsw i32 [[TMP11]], [[TMP9]] +; UNROLL-NO-VF-NEXT: [[TMP13:%.*]] = mul nsw i32 [[TMP11]], [[TMP10]] ; UNROLL-NO-VF-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]] ; UNROLL-NO-VF-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP1]] ; UNROLL-NO-VF-NEXT: store i32 [[TMP12]], ptr [[TMP14]], align 4 @@ -2006,11 +2004,10 @@ define void @PR34711(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %n) { ; UNROLL-NO-VF-NEXT: [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 2 ; UNROLL-NO-VF-NEXT: [[TMP7]] = load i16, ptr [[TMP5]], align 2 ; UNROLL-NO-VF-NEXT: [[TMP8:%.*]] = sext i16 [[VECTOR_RECUR]] to i32 -; UNROLL-NO-VF-NEXT: [[TMP9:%.*]] = sext i16 [[TMP6]] to i32 ; UNROLL-NO-VF-NEXT: [[TMP10:%.*]] = sext i16 [[TMP6]] to i32 ; UNROLL-NO-VF-NEXT: [[TMP11:%.*]] = sext i16 [[TMP7]] to i32 ; UNROLL-NO-VF-NEXT: [[TMP12:%.*]] = mul nsw i32 [[TMP10]], [[TMP8]] -; UNROLL-NO-VF-NEXT: [[TMP13:%.*]] = mul nsw i32 [[TMP11]], [[TMP9]] +; UNROLL-NO-VF-NEXT: [[TMP13:%.*]] = mul nsw i32 [[TMP11]], [[TMP10]] ; UNROLL-NO-VF-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]] ; UNROLL-NO-VF-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP1]] ; UNROLL-NO-VF-NEXT: store i32 [[TMP12]], ptr [[TMP14]], align 4 @@ -2228,9 +2225,8 @@ define void @sink_after_with_multiple_users(ptr noalias %a, ptr noalias %b, i64 ; UNROLL-NO-VF-NEXT: [[TMP9:%.*]] = sext i16 [[TMP6]] to i32 ; UNROLL-NO-VF-NEXT: [[TMP10:%.*]] = add nsw i32 [[TMP8]], 2 ; UNROLL-NO-VF-NEXT: [[TMP11:%.*]] = add nsw i32 [[TMP9]], 2 -; UNROLL-NO-VF-NEXT: [[TMP12:%.*]] = sext i16 [[TMP6]] to i32 ; UNROLL-NO-VF-NEXT: [[TMP13:%.*]] = sext i16 [[TMP7]] to i32 -; UNROLL-NO-VF-NEXT: [[TMP14:%.*]] = mul nsw i32 [[TMP10]], [[TMP12]] +; UNROLL-NO-VF-NEXT: [[TMP14:%.*]] = mul nsw i32 [[TMP10]], [[TMP9]] ; UNROLL-NO-VF-NEXT: [[TMP15:%.*]] = mul nsw i32 [[TMP11]], [[TMP13]] ; UNROLL-NO-VF-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]] ; UNROLL-NO-VF-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP1]] diff --git a/llvm/test/Transforms/LoopVectorize/float-induction.ll b/llvm/test/Transforms/LoopVectorize/float-induction.ll index fb6e6be7ab22f..d3a5963bb5877 100644 --- a/llvm/test/Transforms/LoopVectorize/float-induction.ll +++ b/llvm/test/Transforms/LoopVectorize/float-induction.ll @@ -932,8 +932,7 @@ define void @fp_iv_loop3(float %init, ptr noalias nocapture %A, ptr noalias noca ; VEC1_INTERL2-NEXT: [[TMP4:%.*]] = or disjoint i64 [[INDEX]], 1 ; VEC1_INTERL2-NEXT: [[DOTCAST5:%.*]] = sitofp i64 [[INDEX]] to float ; VEC1_INTERL2-NEXT: [[TMP5:%.*]] = fmul fast float [[DOTCAST5]], -5.000000e-01 -; VEC1_INTERL2-NEXT: [[DOTCAST6:%.*]] = sitofp i64 [[INDEX]] to float -; VEC1_INTERL2-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP0]], [[DOTCAST6]] +; VEC1_INTERL2-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP0]], [[DOTCAST5]] ; VEC1_INTERL2-NEXT: [[OFFSET_IDX7:%.*]] = fadd fast float [[INIT]], [[TMP6]] ; VEC1_INTERL2-NEXT: [[TMP7:%.*]] = fadd fast float [[OFFSET_IDX7]], [[TMP0]] ; VEC1_INTERL2-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] diff --git a/llvm/test/Transforms/LoopVectorize/outer_loop_test1.ll b/llvm/test/Transforms/LoopVectorize/outer_loop_test1.ll index 80e7de71870b7..8ec90f552ddab 100644 --- a/llvm/test/Transforms/LoopVectorize/outer_loop_test1.ll +++ b/llvm/test/Transforms/LoopVectorize/outer_loop_test1.ll 
@@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; extern int arr[8][8]; ; extern int arr2[8]; ; @@ -24,8 +25,7 @@ ; CHECK: %[[AAddr:.*]] = getelementptr inbounds [8 x i32], ptr @arr2, i64 0, <4 x i64> %[[VecInd]] ; CHECK: %[[VecIndTr:.*]] = trunc <4 x i64> %[[VecInd]] to <4 x i32> ; CHECK: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %[[VecIndTr]], <4 x ptr> %[[AAddr]], i32 4, <4 x i1> splat (i1 true)) -; CHECK: %[[VecIndTr2:.*]] = trunc <4 x i64> %[[VecInd]] to <4 x i32> -; CHECK: %[[StoreVal:.*]] = add nsw <4 x i32> %[[VecIndTr2]], %[[Splat]] +; CHECK: %[[StoreVal:.*]] = add nsw <4 x i32> %[[VecIndTr]], %[[Splat]] ; CHECK: br label %[[InnerLoop:.+]] ; CHECK: [[InnerLoop]]: diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll index e762c9ff81322..cc479f7044ae1 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll @@ -1295,8 +1295,7 @@ define i32 @predicated_not_dominates_reduction_twoadd(ptr nocapture noundef read ; CHECK-NEXT: [[TMP5:%.*]] = zext nneg <4 x i8> [[TMP4]] to <4 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]]) ; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP9:%.*]] = zext nneg <4 x i8> [[TMP4]] to <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP9]]) +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]]) ; CHECK-NEXT: [[TMP11]] = add i32 [[TMP10]], [[TMP8]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
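
For context, the cse() helper that consumes CSEDenseMapInfo looks roughly like the sketch below (paraphrased from LoopVectorize.cpp; treat it as an approximation for review, not part of this patch). Map equality goes through CSEDenseMapInfo::isEqual and Instruction::isIdenticalTo, which consults hasSameSpecialState, so the Instruction.cpp hunk records a cast's destination type as part of that identity check, and the FunctionComparator.cpp hunk keeps cmpOperations consistent with it.

  // Approximate shape of the existing CSE driver in LoopVectorize.cpp.
  // With CastInst and SelectInst now accepted by canHandle(), a second
  // identical extend or select in the vector body folds into the first.
  static void cse(BasicBlock *BB) {
    SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
    for (Instruction &In : llvm::make_early_inc_range(*BB)) {
      if (!CSEDenseMapInfo::canHandle(&In))
        continue;
      // An identical instruction was seen earlier: reuse it, drop this one.
      if (Instruction *V = CSEMap.lookup(&In)) {
        In.replaceAllUsesWith(V);
        In.eraseFromParent();
        continue;
      }
      CSEMap[&In] = &In;
    }
  }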
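
At the IR level, the effect is the one visible throughout the test updates above. A minimal hypothetical vector-body excerpt (illustrative only, not taken from any of the tests) would be rewritten as follows:

  ; Before CSE: two identical widening casts of the same loaded vector.
  %ext1 = sext <4 x i8> %wide.load to <4 x i32>
  %ext2 = sext <4 x i8> %wide.load to <4 x i32>
  %sum = add <4 x i32> %ext1, %ext2

  ; After CSE: %ext2 is erased and its uses are rewired to %ext1.
  %ext1 = sext <4 x i8> %wide.load to <4 x i32>
  %sum = add <4 x i32> %ext1, %ext1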