Skip to content

Commit 3668970

Browse files
committed
[LV] Remove common extends and selects in CSE
This PR extends the LoopVectorizer's common sub-expression elimination (CSE) to remove identical extend and select instructions. Originally the work only included extends but that uncovered some duplicate selects in reduction-inloop.ll.
1 parent 7d52b09 commit 3668970

21 files changed

+371
-74
lines changed

llvm/lib/IR/Instruction.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -920,6 +920,8 @@ bool Instruction::hasSameSpecialState(const Instruction *I2,
920920
if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I1))
921921
return GEP->getSourceElementType() ==
922922
cast<GetElementPtrInst>(I2)->getSourceElementType();
923+
if (const CastInst *Cast = dyn_cast<CastInst>(I1))
924+
return Cast->getDestTy() == cast<CastInst>(I2)->getDestTy();
923925

924926
return true;
925927
}

llvm/lib/Transforms/Utils/FunctionComparator.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -804,6 +804,10 @@ int FunctionComparator::cmpOperations(const Instruction *L,
804804
return Res;
805805
}
806806
}
807+
if (const CastInst *Cast = dyn_cast<CastInst>(L)) {
808+
const CastInst *CastR = cast<CastInst>(R);
809+
return cmpTypes(Cast->getDestTy(), CastR->getDestTy());
810+
}
807811
return 0;
808812
}
809813

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2643,7 +2643,8 @@ namespace {
26432643
struct CSEDenseMapInfo {
26442644
static bool canHandle(const Instruction *I) {
26452645
return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
2646-
isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
2646+
isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I) ||
2647+
isa<CastInst>(I) || isa<SelectInst>(I);
26472648
}
26482649

26492650
static inline Instruction *getEmptyKey() {

llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll

Lines changed: 7 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -47,10 +47,8 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
4747
; DEFAULT-NEXT: [[TMP27:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i16>
4848
; DEFAULT-NEXT: [[TMP28:%.*]] = mul <vscale x 8 x i16> [[TMP26]], [[TMP13]]
4949
; DEFAULT-NEXT: [[TMP29:%.*]] = mul <vscale x 8 x i16> [[TMP27]], [[TMP13]]
50-
; DEFAULT-NEXT: [[TMP30:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i16>
51-
; DEFAULT-NEXT: [[TMP31:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i16>
52-
; DEFAULT-NEXT: [[TMP32:%.*]] = or <vscale x 8 x i16> [[TMP28]], [[TMP30]]
53-
; DEFAULT-NEXT: [[TMP33:%.*]] = or <vscale x 8 x i16> [[TMP29]], [[TMP31]]
50+
; DEFAULT-NEXT: [[TMP32:%.*]] = or <vscale x 8 x i16> [[TMP28]], [[TMP26]]
51+
; DEFAULT-NEXT: [[TMP33:%.*]] = or <vscale x 8 x i16> [[TMP29]], [[TMP27]]
5452
; DEFAULT-NEXT: [[TMP34:%.*]] = lshr <vscale x 8 x i16> [[TMP32]], splat (i16 1)
5553
; DEFAULT-NEXT: [[TMP35:%.*]] = lshr <vscale x 8 x i16> [[TMP33]], splat (i16 1)
5654
; DEFAULT-NEXT: [[TMP36:%.*]] = trunc <vscale x 8 x i16> [[TMP34]] to <vscale x 8 x i8>
@@ -129,8 +127,7 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
129127
; PRED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP19]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
130128
; PRED-NEXT: [[TMP17:%.*]] = zext <vscale x 16 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 16 x i16>
131129
; PRED-NEXT: [[TMP22:%.*]] = mul <vscale x 16 x i16> [[TMP17]], [[TMP16]]
132-
; PRED-NEXT: [[TMP24:%.*]] = zext <vscale x 16 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 16 x i16>
133-
; PRED-NEXT: [[TMP20:%.*]] = or <vscale x 16 x i16> [[TMP22]], [[TMP24]]
130+
; PRED-NEXT: [[TMP20:%.*]] = or <vscale x 16 x i16> [[TMP22]], [[TMP17]]
134131
; PRED-NEXT: [[TMP21:%.*]] = lshr <vscale x 16 x i16> [[TMP20]], splat (i16 1)
135132
; PRED-NEXT: [[TMP23:%.*]] = trunc <vscale x 16 x i16> [[TMP21]] to <vscale x 16 x i8>
136133
; PRED-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]]
@@ -385,9 +382,8 @@ define void @trunc_ivs_and_store(i32 %x, ptr %dst, i64 %N) #0 {
385382
; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
386383
; DEFAULT-NEXT: [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32
387384
; DEFAULT-NEXT: [[TMP15:%.*]] = add i32 [[OFFSET_IDX]], 1
388-
; DEFAULT-NEXT: [[TMP16:%.*]] = trunc i64 [[INDEX]] to i32
389-
; DEFAULT-NEXT: [[TMP18:%.*]] = add i32 [[TMP16]], 1
390-
; DEFAULT-NEXT: [[TMP19:%.*]] = mul i32 [[MUL]], [[TMP16]]
385+
; DEFAULT-NEXT: [[TMP18:%.*]] = add i32 [[OFFSET_IDX]], 1
386+
; DEFAULT-NEXT: [[TMP19:%.*]] = mul i32 [[MUL]], [[OFFSET_IDX]]
391387
; DEFAULT-NEXT: [[TMP20:%.*]] = mul i32 [[MUL]], [[TMP18]]
392388
; DEFAULT-NEXT: [[TMP21:%.*]] = zext i32 [[TMP19]] to i64
393389
; DEFAULT-NEXT: [[TMP22:%.*]] = zext i32 [[TMP20]] to i64
@@ -580,9 +576,8 @@ define void @ivs_trunc_and_ext(i32 %x, ptr %dst, i64 %N) #0 {
580576
; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
581577
; DEFAULT-NEXT: [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32
582578
; DEFAULT-NEXT: [[TMP14:%.*]] = add i32 [[OFFSET_IDX]], 1
583-
; DEFAULT-NEXT: [[TMP15:%.*]] = trunc i64 [[INDEX]] to i32
584-
; DEFAULT-NEXT: [[TMP17:%.*]] = add i32 [[TMP15]], 1
585-
; DEFAULT-NEXT: [[TMP18:%.*]] = mul i32 [[ADD]], [[TMP15]]
579+
; DEFAULT-NEXT: [[TMP17:%.*]] = add i32 [[OFFSET_IDX]], 1
580+
; DEFAULT-NEXT: [[TMP18:%.*]] = mul i32 [[ADD]], [[OFFSET_IDX]]
586581
; DEFAULT-NEXT: [[TMP19:%.*]] = mul i32 [[ADD]], [[TMP17]]
587582
; DEFAULT-NEXT: [[TMP20:%.*]] = zext i32 [[TMP18]] to i64
588583
; DEFAULT-NEXT: [[TMP21:%.*]] = zext i32 [[TMP19]] to i64

llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_test1_no_explicit_vect_width.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,7 @@
2727
; CHECK: %[[AAddr:.*]] = getelementptr inbounds [8 x i32], ptr @arr2, i64 0, <4 x i64> %[[VecInd]]
2828
; CHECK: %[[VecIndTr:.*]] = trunc <4 x i64> %[[VecInd]] to <4 x i32>
2929
; CHECK: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %[[VecIndTr]], <4 x ptr> %[[AAddr]], i32 4, <4 x i1> splat (i1 true))
30-
; CHECK: %[[VecIndTr2:.*]] = trunc <4 x i64> %[[VecInd]] to <4 x i32>
31-
; CHECK: %[[StoreVal:.*]] = add nsw <4 x i32> %[[VecIndTr2]], %[[Splat]]
30+
; CHECK: %[[StoreVal:.*]] = add nsw <4 x i32> %[[VecIndTr]], %[[Splat]]
3231
; CHECK: br label %[[InnerLoop:.+]]
3332

3433
; CHECK: [[InnerLoop]]:

llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1679,8 +1679,7 @@ define i64 @test_std_q31(ptr %x, i32 %n) #0 {
16791679
; CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i32> [[TMP1]] to <4 x i64>
16801680
; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP2]])
16811681
; CHECK-NEXT: [[TMP4]] = add i64 [[TMP3]], [[VEC_PHI]]
1682-
; CHECK-NEXT: [[TMP5:%.*]] = sext <4 x i32> [[TMP1]] to <4 x i64>
1683-
; CHECK-NEXT: [[TMP6:%.*]] = mul nsw <4 x i64> [[TMP5]], [[TMP5]]
1682+
; CHECK-NEXT: [[TMP6:%.*]] = mul nsw <4 x i64> [[TMP2]], [[TMP2]]
16841683
; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP6]])
16851684
; CHECK-NEXT: [[TMP8]] = add i64 [[TMP7]], [[VEC_PHI1]]
16861685
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4

llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-bswap.ll

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,7 @@ define dso_local void @test(ptr %Arr, i32 signext %Len) {
2020
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
2121
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4
2222
; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> [[WIDE_LOAD]])
23-
; CHECK-NEXT: [[TMP5:%.*]] = sext i32 [[INDEX]] to i64
24-
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP5]]
25-
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
26-
; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr [[TMP7]], align 4
23+
; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr [[TMP3]], align 4
2724
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
2825
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
2926
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]

llvm/test/Transforms/LoopVectorize/RISCV/evl-compatible-loops.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,7 @@ define void @test_wide_integer_induction(ptr noalias %a, i64 %N) {
3535
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[EVL_BASED_IV]]
3636
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[TMP14]], i32 0
3737
; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VEC_IND]], ptr align 8 [[TMP15]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
38-
; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP11]] to i64
39-
; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]]
38+
; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP12]], [[EVL_BASED_IV]]
4039
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], [[TMP8]]
4140
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
4241
; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]]

llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -108,9 +108,8 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) {
108108
; RV32-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]])
109109
; RV32-NEXT: [[TMP15:%.*]] = add <vscale x 4 x i32> [[REVERSE]], splat (i32 1)
110110
; RV32-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]]
111-
; RV32-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP5]] to i32
112-
; RV32-NEXT: [[TMP18:%.*]] = mul i32 0, [[TMP17]]
113-
; RV32-NEXT: [[TMP25:%.*]] = sub i32 [[TMP17]], 1
111+
; RV32-NEXT: [[TMP18:%.*]] = mul i32 0, [[TMP10]]
112+
; RV32-NEXT: [[TMP25:%.*]] = sub i32 [[TMP10]], 1
114113
; RV32-NEXT: [[TMP19:%.*]] = mul i32 -1, [[TMP25]]
115114
; RV32-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 [[TMP18]]
116115
; RV32-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i32 [[TMP19]]
@@ -322,9 +321,8 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) {
322321
; RV32-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]])
323322
; RV32-NEXT: [[TMP15:%.*]] = fadd <vscale x 4 x float> [[REVERSE]], splat (float 1.000000e+00)
324323
; RV32-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]]
325-
; RV32-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP5]] to i32
326-
; RV32-NEXT: [[TMP18:%.*]] = mul i32 0, [[TMP17]]
327-
; RV32-NEXT: [[TMP25:%.*]] = sub i32 [[TMP17]], 1
324+
; RV32-NEXT: [[TMP18:%.*]] = mul i32 0, [[TMP10]]
325+
; RV32-NEXT: [[TMP25:%.*]] = sub i32 [[TMP10]], 1
328326
; RV32-NEXT: [[TMP19:%.*]] = mul i32 -1, [[TMP25]]
329327
; RV32-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i32 [[TMP18]]
330328
; RV32-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 [[TMP19]]

llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cast-intrinsics.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -982,8 +982,7 @@ define void @vp_ptrtoint(ptr %a, ptr %b, i64 %N) {
982982
; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[EVL_BASED_IV]]
983983
; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[TMP16]], i32 0
984984
; IF-EVL-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP15]], ptr align 8 [[TMP17]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
985-
; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP11]] to i64
986-
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP18]], [[EVL_BASED_IV]]
985+
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP12]], [[EVL_BASED_IV]]
987986
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
988987
; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
989988
; IF-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]

0 commit comments

Comments
 (0)