Skip to content

Commit 0e1f4d4

Browse files
committed
[SLP]Improve reductions analysis and emission, part 1.
Currently SLP vectorizer walks through the instructions and selects 3 main classes of values: 1) reduction operations - instructions with same reduction opcode (add, mul, min/max, etc.), which build the reduction, 2) reduced values - instructions with the same opcodes, but different from the reduction opcode, 3) extra arguments - all other values, instructions from the different basic block rather than the root node, instructions with to many/less uses. This scheme is not very efficient. It excludes some instructions and all non-instruction values from the reductions (constants, proficient gathers), to many possibly reduced values are marked as extra arguments. Patch improves this process by introducing a bit extended analysis stage. During this stage, we still try to select 3 classes of the values: 1) reduction operations - same as before, 2) possibly reduced values - all instructions from the current block/non-instructions, which may build a vectorization tree, 3) extra arguments - instructions from the different basic blocks. Additionally, an extra sorting of the possibly reduced values occurs to build the scalar sequences which highly likely will bed vectorized, e.g. loads are grouped by the distance between them, constants are grouped together, cmp instructions are sorted by their compare types and predicates, extractelement instructions are sorted by the vector operand, etc. Also, these groups are reordered by their length so the longest group is the first in the list of the possibly reduced values. The vectorization process tries to emit the reductions for all these groups. These reductions, remaining non-vectorized possible reduced values and extra arguments are then combined into the final expression just like it was before. Differential Revision: https://reviews.llvm.org/D114171
1 parent c986d47 commit 0e1f4d4

21 files changed

+837
-767
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 441 additions & 238 deletions
Large diffs are not rendered by default.

llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll

Lines changed: 19 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -90,23 +90,16 @@ return:
9090
define float @test_merge_anyof_v4sf(<4 x float> %t) {
9191
; CHECK-LABEL: @test_merge_anyof_v4sf(
9292
; CHECK-NEXT: entry:
93-
; CHECK-NEXT: [[T_FR:%.*]] = freeze <4 x float> [[T:%.*]]
94-
; CHECK-NEXT: [[TMP0:%.*]] = fcmp olt <4 x float> [[T_FR]], zeroinitializer
95-
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i1> [[TMP0]] to i4
96-
; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i4 [[TMP1]], 0
97-
; CHECK-NEXT: [[TMP3:%.*]] = fcmp ogt <4 x float> [[T_FR]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
98-
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP3]], i64 0
99-
; CHECK-NEXT: [[OR_COND3:%.*]] = or i1 [[TMP2]], [[TMP4]]
100-
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP3]], i64 1
101-
; CHECK-NEXT: [[OR_COND4:%.*]] = or i1 [[OR_COND3]], [[TMP5]]
102-
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP3]], i64 2
103-
; CHECK-NEXT: [[OR_COND5:%.*]] = or i1 [[OR_COND4]], [[TMP6]]
104-
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP3]], i64 3
105-
; CHECK-NEXT: [[OR_COND6:%.*]] = or i1 [[OR_COND5]], [[TMP7]]
106-
; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[T_FR]], <4 x float> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
107-
; CHECK-NEXT: [[TMP8:%.*]] = fadd <4 x float> [[SHIFT]], [[T_FR]]
108-
; CHECK-NEXT: [[ADD:%.*]] = extractelement <4 x float> [[TMP8]], i64 0
109-
; CHECK-NEXT: [[RETVAL_0:%.*]] = select i1 [[OR_COND6]], float 0.000000e+00, float [[ADD]]
93+
; CHECK-NEXT: [[T_FR7:%.*]] = freeze <4 x float> [[T:%.*]]
94+
; CHECK-NEXT: [[TMP0:%.*]] = fcmp olt <4 x float> [[T_FR7]], zeroinitializer
95+
; CHECK-NEXT: [[TMP1:%.*]] = fcmp ogt <4 x float> [[T_FR7]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
96+
; CHECK-NEXT: [[TMP2:%.*]] = or <4 x i1> [[TMP1]], [[TMP0]]
97+
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i1> [[TMP2]] to i4
98+
; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i4 [[TMP3]], 0
99+
; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[T_FR7]], <4 x float> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
100+
; CHECK-NEXT: [[TMP4:%.*]] = fadd <4 x float> [[SHIFT]], [[T_FR7]]
101+
; CHECK-NEXT: [[ADD:%.*]] = extractelement <4 x float> [[TMP4]], i64 0
102+
; CHECK-NEXT: [[RETVAL_0:%.*]] = select i1 [[DOTNOT]], float [[ADD]], float 0.000000e+00
110103
; CHECK-NEXT: ret float [[RETVAL_0]]
111104
;
112105
entry:
@@ -419,24 +412,16 @@ return:
419412
define float @test_merge_anyof_v4si(<4 x i32> %t) {
420413
; CHECK-LABEL: @test_merge_anyof_v4si(
421414
; CHECK-NEXT: entry:
422-
; CHECK-NEXT: [[T_FR:%.*]] = freeze <4 x i32> [[T:%.*]]
423-
; CHECK-NEXT: [[TMP0:%.*]] = icmp slt <4 x i32> [[T_FR]], <i32 1, i32 1, i32 1, i32 1>
424-
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i1> [[TMP0]] to i4
425-
; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i4 [[TMP1]], 0
426-
; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i32> [[T_FR]], <i32 255, i32 255, i32 255, i32 255>
427-
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP3]], i64 0
428-
; CHECK-NEXT: [[OR_COND3:%.*]] = or i1 [[TMP2]], [[TMP4]]
429-
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP3]], i64 1
430-
; CHECK-NEXT: [[OR_COND4:%.*]] = or i1 [[OR_COND3]], [[TMP5]]
431-
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP3]], i64 2
432-
; CHECK-NEXT: [[OR_COND5:%.*]] = or i1 [[OR_COND4]], [[TMP6]]
433-
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP3]], i64 3
434-
; CHECK-NEXT: [[OR_COND6:%.*]] = or i1 [[OR_COND5]], [[TMP7]]
435-
; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x i32> [[T_FR]], <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
436-
; CHECK-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[SHIFT]], [[T_FR]]
437-
; CHECK-NEXT: [[ADD:%.*]] = extractelement <4 x i32> [[TMP8]], i64 0
415+
; CHECK-NEXT: [[T_FR7:%.*]] = freeze <4 x i32> [[T:%.*]]
416+
; CHECK-NEXT: [[TMP0:%.*]] = add <4 x i32> [[T_FR7]], <i32 -256, i32 -256, i32 -256, i32 -256>
417+
; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <4 x i32> [[TMP0]], <i32 -255, i32 -255, i32 -255, i32 -255>
418+
; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i1> [[TMP1]] to i4
419+
; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i4 [[TMP2]], 0
420+
; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x i32> [[T_FR7]], <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
421+
; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[SHIFT]], [[T_FR7]]
422+
; CHECK-NEXT: [[ADD:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
438423
; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[ADD]] to float
439-
; CHECK-NEXT: [[RETVAL_0:%.*]] = select i1 [[OR_COND6]], float 0.000000e+00, float [[CONV]]
424+
; CHECK-NEXT: [[RETVAL_0:%.*]] = select i1 [[DOTNOT]], float [[CONV]], float 0.000000e+00
440425
; CHECK-NEXT: ret float [[RETVAL_0]]
441426
;
442427
entry:

llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ define i32 @ext_ext_partial_add_reduction_v4i32(<4 x i32> %x) {
4141

4242
define i32 @ext_ext_partial_add_reduction_and_extra_add_v4i32(<4 x i32> %x, <4 x i32> %y) {
4343
; CHECK-LABEL: @ext_ext_partial_add_reduction_and_extra_add_v4i32(
44-
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> <i32 4, i32 2, i32 5, i32 6>
44+
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[Y:%.*]], <4 x i32> [[X:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 6>
4545
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
4646
; CHECK-NEXT: ret i32 [[TMP2]]
4747
;

llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,10 @@ define void @PR28330(i32 %n) {
1515
; DEFAULT-NEXT: [[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer
1616
; DEFAULT-NEXT: br label [[FOR_BODY:%.*]]
1717
; DEFAULT: for.body:
18-
; DEFAULT-NEXT: [[P17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
18+
; DEFAULT-NEXT: [[P17:%.*]] = phi i32 [ [[OP_RDX:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
1919
; DEFAULT-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> <i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720>, <8 x i32> <i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80>
2020
; DEFAULT-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]])
21-
; DEFAULT-NEXT: [[OP_EXTRA]] = add i32 [[TMP3]], [[P17]]
21+
; DEFAULT-NEXT: [[OP_RDX]] = add i32 [[TMP3]], [[P17]]
2222
; DEFAULT-NEXT: br label [[FOR_BODY]]
2323
;
2424
; GATHER-LABEL: @PR28330(
@@ -27,10 +27,10 @@ define void @PR28330(i32 %n) {
2727
; GATHER-NEXT: [[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer
2828
; GATHER-NEXT: br label [[FOR_BODY:%.*]]
2929
; GATHER: for.body:
30-
; GATHER-NEXT: [[P17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
30+
; GATHER-NEXT: [[P17:%.*]] = phi i32 [ [[OP_RDX:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
3131
; GATHER-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> <i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720>, <8 x i32> <i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80>
3232
; GATHER-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]])
33-
; GATHER-NEXT: [[OP_EXTRA]] = add i32 [[TMP3]], [[P17]]
33+
; GATHER-NEXT: [[OP_RDX]] = add i32 [[TMP3]], [[P17]]
3434
; GATHER-NEXT: br label [[FOR_BODY]]
3535
;
3636
; MAX-COST-LABEL: @PR28330(
@@ -39,10 +39,10 @@ define void @PR28330(i32 %n) {
3939
; MAX-COST-NEXT: [[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer
4040
; MAX-COST-NEXT: br label [[FOR_BODY:%.*]]
4141
; MAX-COST: for.body:
42-
; MAX-COST-NEXT: [[P17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
42+
; MAX-COST-NEXT: [[P17:%.*]] = phi i32 [ [[OP_RDX:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
4343
; MAX-COST-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> <i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720>, <8 x i32> <i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80>
4444
; MAX-COST-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]])
45-
; MAX-COST-NEXT: [[OP_EXTRA]] = add i32 [[TMP3]], [[P17]]
45+
; MAX-COST-NEXT: [[OP_RDX]] = add i32 [[TMP3]], [[P17]]
4646
; MAX-COST-NEXT: br label [[FOR_BODY]]
4747
;
4848
entry:
@@ -92,10 +92,10 @@ define void @PR32038(i32 %n) {
9292
; DEFAULT-NEXT: [[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer
9393
; DEFAULT-NEXT: br label [[FOR_BODY:%.*]]
9494
; DEFAULT: for.body:
95-
; DEFAULT-NEXT: [[P17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
95+
; DEFAULT-NEXT: [[P17:%.*]] = phi i32 [ [[OP_RDX:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
9696
; DEFAULT-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> <i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720>, <8 x i32> <i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80>
9797
; DEFAULT-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]])
98-
; DEFAULT-NEXT: [[OP_EXTRA]] = add i32 [[TMP3]], -5
98+
; DEFAULT-NEXT: [[OP_RDX]] = add i32 [[TMP3]], -5
9999
; DEFAULT-NEXT: br label [[FOR_BODY]]
100100
;
101101
; GATHER-LABEL: @PR32038(
@@ -104,10 +104,10 @@ define void @PR32038(i32 %n) {
104104
; GATHER-NEXT: [[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer
105105
; GATHER-NEXT: br label [[FOR_BODY:%.*]]
106106
; GATHER: for.body:
107-
; GATHER-NEXT: [[P17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
107+
; GATHER-NEXT: [[P17:%.*]] = phi i32 [ [[OP_RDX:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
108108
; GATHER-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> <i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720>, <8 x i32> <i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80>
109109
; GATHER-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]])
110-
; GATHER-NEXT: [[OP_EXTRA]] = add i32 [[TMP3]], -5
110+
; GATHER-NEXT: [[OP_RDX]] = add i32 [[TMP3]], -5
111111
; GATHER-NEXT: br label [[FOR_BODY]]
112112
;
113113
; MAX-COST-LABEL: @PR32038(
@@ -116,10 +116,10 @@ define void @PR32038(i32 %n) {
116116
; MAX-COST-NEXT: [[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer
117117
; MAX-COST-NEXT: br label [[FOR_BODY:%.*]]
118118
; MAX-COST: for.body:
119-
; MAX-COST-NEXT: [[P17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
119+
; MAX-COST-NEXT: [[P17:%.*]] = phi i32 [ [[OP_RDX:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
120120
; MAX-COST-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> <i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720>, <8 x i32> <i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80>
121121
; MAX-COST-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]])
122-
; MAX-COST-NEXT: [[OP_EXTRA]] = add i32 [[TMP3]], -5
122+
; MAX-COST-NEXT: [[OP_RDX]] = add i32 [[TMP3]], -5
123123
; MAX-COST-NEXT: br label [[FOR_BODY]]
124124
;
125125
entry:

llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -204,10 +204,10 @@ define <4 x i32> @build_vec_v4i32_3_binops(<2 x i32> %v0, <2 x i32> %v1) {
204204

205205
define i32 @reduction_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
206206
; CHECK-LABEL: @reduction_v4i32(
207-
; CHECK-NEXT: [[TMP1:%.*]] = sub <4 x i32> [[V0:%.*]], [[V1:%.*]]
208-
; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[V0]], [[V1]]
209-
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 7, i32 2>
210-
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 1, i32 4, i32 6, i32 3>
207+
; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[V0:%.*]], [[V1:%.*]]
208+
; CHECK-NEXT: [[TMP2:%.*]] = sub <4 x i32> [[V0]], [[V1]]
209+
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 3, i32 6>
210+
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 1, i32 4, i32 2, i32 7>
211211
; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[TMP3]]
212212
; CHECK-NEXT: [[TMP6:%.*]] = lshr <4 x i32> [[TMP5]], <i32 15, i32 15, i32 15, i32 15>
213213
; CHECK-NEXT: [[TMP7:%.*]] = and <4 x i32> [[TMP6]], <i32 65537, i32 65537, i32 65537, i32 65537>

llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -204,10 +204,10 @@ define <4 x i32> @build_vec_v4i32_3_binops(<2 x i32> %v0, <2 x i32> %v1) {
204204

205205
define i32 @reduction_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
206206
; CHECK-LABEL: @reduction_v4i32(
207-
; CHECK-NEXT: [[TMP1:%.*]] = sub <4 x i32> [[V0:%.*]], [[V1:%.*]]
208-
; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[V0]], [[V1]]
209-
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 7, i32 2>
210-
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 1, i32 4, i32 6, i32 3>
207+
; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[V0:%.*]], [[V1:%.*]]
208+
; CHECK-NEXT: [[TMP2:%.*]] = sub <4 x i32> [[V0]], [[V1]]
209+
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 3, i32 6>
210+
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 1, i32 4, i32 2, i32 7>
211211
; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[TMP3]]
212212
; CHECK-NEXT: [[TMP6:%.*]] = lshr <4 x i32> [[TMP5]], <i32 15, i32 15, i32 15, i32 15>
213213
; CHECK-NEXT: [[TMP7:%.*]] = and <4 x i32> [[TMP6]], <i32 65537, i32 65537, i32 65537, i32 65537>

llvm/test/Transforms/SLPVectorizer/X86/PR35628_1.ll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ define void @mainTest(i32* %ptr) #0 {
88
; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32* [[PTR:%.*]], null
99
; CHECK-NEXT: br i1 [[CMP]], label [[LOOP:%.*]], label [[BAIL_OUT:%.*]]
1010
; CHECK: loop:
11-
; CHECK-NEXT: [[DUMMY_PHI:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[OP_EXTRA3:%.*]], [[LOOP]] ]
11+
; CHECK-NEXT: [[DUMMY_PHI:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[OP_RDX3:%.*]], [[LOOP]] ]
1212
; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[PTR]] to <4 x i32>*
1313
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
1414
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
@@ -17,10 +17,10 @@ define void @mainTest(i32* %ptr) #0 {
1717
; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i32> [[TMP1]], [[TMP1]]
1818
; CHECK-NEXT: [[TMP6:%.*]] = sext i32 [[TMP3]] to i64
1919
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]])
20-
; CHECK-NEXT: [[OP_EXTRA:%.*]] = add i32 [[TMP7]], 1
21-
; CHECK-NEXT: [[OP_EXTRA1:%.*]] = add i32 [[OP_EXTRA]], [[TMP4]]
22-
; CHECK-NEXT: [[OP_EXTRA2:%.*]] = add i32 [[OP_EXTRA1]], [[TMP3]]
23-
; CHECK-NEXT: [[OP_EXTRA3]] = add i32 [[OP_EXTRA2]], [[TMP2]]
20+
; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP7]], [[TMP4]]
21+
; CHECK-NEXT: [[OP_RDX1:%.*]] = add i32 [[OP_RDX]], [[TMP3]]
22+
; CHECK-NEXT: [[OP_RDX2:%.*]] = add i32 [[OP_RDX1]], [[TMP2]]
23+
; CHECK-NEXT: [[OP_RDX3]] = add i32 [[OP_RDX2]], 1
2424
; CHECK-NEXT: br label [[LOOP]]
2525
; CHECK: bail_out:
2626
; CHECK-NEXT: ret void

llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,20 +7,20 @@ define void @test() #0 {
77
; CHECK-NEXT: entry:
88
; CHECK-NEXT: br label [[LOOP:%.*]]
99
; CHECK: loop:
10-
; CHECK-NEXT: [[DUMMY_PHI:%.*]] = phi i64 [ 1, [[ENTRY:%.*]] ], [ [[OP_EXTRA1:%.*]], [[LOOP]] ]
10+
; CHECK-NEXT: [[DUMMY_PHI:%.*]] = phi i64 [ 1, [[ENTRY:%.*]] ], [ [[OP_RDX1:%.*]], [[LOOP]] ]
1111
; CHECK-NEXT: [[TMP0:%.*]] = phi i64 [ 2, [[ENTRY]] ], [ [[TMP3:%.*]], [[LOOP]] ]
1212
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> poison, i64 [[TMP0]], i32 0
1313
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> poison, <4 x i32> zeroinitializer
14-
; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i64> [[SHUFFLE]], <i64 3, i64 2, i64 1, i64 0>
14+
; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i64> [[SHUFFLE]], <i64 2, i64 3, i64 1, i64 0>
1515
; CHECK-NEXT: [[TMP3]] = extractelement <4 x i64> [[TMP2]], i32 3
1616
; CHECK-NEXT: [[DUMMY_ADD:%.*]] = add i16 0, 0
17-
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP2]], i32 0
17+
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1
1818
; CHECK-NEXT: [[DUMMY_SHL:%.*]] = shl i64 [[TMP4]], 32
1919
; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i64> <i64 1, i64 1, i64 1, i64 1>, [[TMP2]]
2020
; CHECK-NEXT: [[TMP6:%.*]] = ashr exact <4 x i64> [[TMP5]], <i64 32, i64 32, i64 32, i64 32>
2121
; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP6]])
22-
; CHECK-NEXT: [[OP_EXTRA:%.*]] = add i64 [[TMP7]], 0
23-
; CHECK-NEXT: [[OP_EXTRA1]] = add i64 [[OP_EXTRA]], [[TMP3]]
22+
; CHECK-NEXT: [[OP_RDX:%.*]] = add i64 [[TMP7]], [[TMP3]]
23+
; CHECK-NEXT: [[OP_RDX1]] = add i64 [[OP_RDX]], 0
2424
; CHECK-NEXT: br label [[LOOP]]
2525
;
2626
entry:

0 commit comments

Comments
 (0)