Skip to content

Commit 418c218

Browse files
committed
Return "[Codegenprepare][X86] Use usub with overflow opt for IV increment"
The patch did not account for one corner case where cmp does not dominate the loop latch. This patch adds this check, hopefully it's cheap because the CFG does not change during the transform, so DT queries should be executed quickly. If you see compile time slowness from this, please revert. Differential Revision: https://reviews.llvm.org/D96119
1 parent 78717f5 commit 418c218

File tree

4 files changed

+73
-37
lines changed

4 files changed

+73
-37
lines changed

llvm/lib/CodeGen/CodeGenPrepare.cpp

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1284,7 +1284,34 @@ bool CodeGenPrepare::replaceMathCmpWithIntrinsic(BinaryOperator *BO,
12841284
Value *Arg0, Value *Arg1,
12851285
CmpInst *Cmp,
12861286
Intrinsic::ID IID) {
1287-
if (BO->getParent() != Cmp->getParent()) {
1287+
auto isIVIncrement = [this, &Cmp](BinaryOperator *BO) {
1288+
auto *PN = dyn_cast<PHINode>(BO->getOperand(0));
1289+
if (!PN)
1290+
return false;
1291+
const Loop *L = LI->getLoopFor(BO->getParent());
1292+
if (!L || L->getHeader() != PN->getParent() || !L->getLoopLatch())
1293+
return false;
1294+
const BasicBlock *Latch = L->getLoopLatch();
1295+
if (PN->getIncomingValueForBlock(Latch) != BO)
1296+
return false;
1297+
if (auto *Step = dyn_cast<Instruction>(BO->getOperand(1)))
1298+
if (L->contains(Step->getParent()))
1299+
return false;
1300+
// IV increment may have other users than the IV. We do not want to make
1301+
// dominance queries to analyze the legality of moving it towards the cmp,
1302+
// so just check that there is no other users.
1303+
if (!BO->hasOneUse())
1304+
return false;
1305+
// Do not risk on moving increment into a child loop.
1306+
if (LI->getLoopFor(Cmp->getParent()) != L)
1307+
return false;
1308+
// Ultimately, the insertion point must dominate latch. This should be a
1309+
// cheap check because no CFG changes & dom tree recomputation happens
1310+
// during the transform.
1311+
Function *F = BO->getParent()->getParent();
1312+
return getDT(*F).dominates(Cmp->getParent(), Latch);
1313+
};
1314+
if (BO->getParent() != Cmp->getParent() && !isIVIncrement(BO)) {
12881315
// We used to use a dominator tree here to allow multi-block optimization.
12891316
// But that was problematic because:
12901317
// 1. It could cause a perf regression by hoisting the math op into the
@@ -1295,6 +1322,14 @@ bool CodeGenPrepare::replaceMathCmpWithIntrinsic(BinaryOperator *BO,
12951322
// This is because we recompute the DT on every change in the main CGP
12961323
// run-loop. The recomputing is probably unnecessary in many cases, so if
12971324
// that was fixed, using a DT here would be ok.
1325+
//
1326+
// There is one important particular case we still want to handle: if BO is
1327+
// the IV increment. Important properties that make it profitable:
1328+
// - We can speculate IV increment anywhere in the loop (as long as the
1329+
// indvar Phi is its only user);
1330+
// - Upon computing Cmp, we effectively compute something equivalent to the
1331+
// IV increment (despite it loops differently in the IR). So moving it up
1332+
// to the cmp point does not really increase register pressure.
12981333
return false;
12991334
}
13001335

llvm/test/CodeGen/X86/2020_12_02_decrementing_loop.ll

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -89,15 +89,16 @@ failure: ; preds = %backedge
8989
define i32 @test_02(i32* %p, i64 %len, i32 %x) {
9090
; CHECK-LABEL: test_02:
9191
; CHECK: ## %bb.0: ## %entry
92+
; CHECK-NEXT: movq %rsi, %rax
9293
; CHECK-NEXT: .p2align 4, 0x90
9394
; CHECK-NEXT: LBB2_1: ## %loop
9495
; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1
95-
; CHECK-NEXT: testq %rsi, %rsi
96-
; CHECK-NEXT: je LBB2_4
96+
; CHECK-NEXT: subq $1, %rax
97+
; CHECK-NEXT: jb LBB2_4
9798
; CHECK-NEXT: ## %bb.2: ## %backedge
9899
; CHECK-NEXT: ## in Loop: Header=BB2_1 Depth=1
99100
; CHECK-NEXT: cmpl %edx, -4(%rdi,%rsi,4)
100-
; CHECK-NEXT: leaq -1(%rsi), %rsi
101+
; CHECK-NEXT: movq %rax, %rsi
101102
; CHECK-NEXT: jne LBB2_1
102103
; CHECK-NEXT: ## %bb.3: ## %failure
103104
; CHECK-NEXT: ud2
@@ -132,15 +133,16 @@ failure: ; preds = %backedge
132133
define i32 @test_03(i32* %p, i64 %len, i32 %x) {
133134
; CHECK-LABEL: test_03:
134135
; CHECK: ## %bb.0: ## %entry
136+
; CHECK-NEXT: movq %rsi, %rax
135137
; CHECK-NEXT: .p2align 4, 0x90
136138
; CHECK-NEXT: LBB3_1: ## %loop
137139
; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1
138-
; CHECK-NEXT: testq %rsi, %rsi
139-
; CHECK-NEXT: je LBB3_4
140+
; CHECK-NEXT: subq $1, %rax
141+
; CHECK-NEXT: jb LBB3_4
140142
; CHECK-NEXT: ## %bb.2: ## %backedge
141143
; CHECK-NEXT: ## in Loop: Header=BB3_1 Depth=1
142144
; CHECK-NEXT: cmpl %edx, -4(%rdi,%rsi,4)
143-
; CHECK-NEXT: leaq -1(%rsi), %rsi
145+
; CHECK-NEXT: movq %rax, %rsi
144146
; CHECK-NEXT: jne LBB3_1
145147
; CHECK-NEXT: ## %bb.3: ## %failure
146148
; CHECK-NEXT: ud2

llvm/test/CodeGen/X86/lsr-loop-exit-cond.ll

Lines changed: 20 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,11 @@ define void @t(i8* nocapture %in, i8* nocapture %out, i32* nocapture %rk, i32 %r
1616
; GENERIC-NEXT: movl (%rdx), %eax
1717
; GENERIC-NEXT: movl 4(%rdx), %ebx
1818
; GENERIC-NEXT: decl %ecx
19-
; GENERIC-NEXT: leaq 20(%rdx), %r14
19+
; GENERIC-NEXT: leaq 20(%rdx), %r11
2020
; GENERIC-NEXT: movq _Te0@{{.*}}(%rip), %r9
2121
; GENERIC-NEXT: movq _Te1@{{.*}}(%rip), %r8
2222
; GENERIC-NEXT: movq _Te3@{{.*}}(%rip), %r10
23-
; GENERIC-NEXT: movq %rcx, %r11
23+
; GENERIC-NEXT: movq %rcx, %r14
2424
; GENERIC-NEXT: .p2align 4, 0x90
2525
; GENERIC-NEXT: LBB0_1: ## %bb
2626
; GENERIC-NEXT: ## =>This Inner Loop Header: Depth=1
@@ -32,30 +32,29 @@ define void @t(i8* nocapture %in, i8* nocapture %out, i32* nocapture %rk, i32 %r
3232
; GENERIC-NEXT: movzbl %bpl, %ebp
3333
; GENERIC-NEXT: movl (%r8,%rbp,4), %ebp
3434
; GENERIC-NEXT: xorl (%r9,%rax,4), %ebp
35-
; GENERIC-NEXT: xorl -12(%r14), %ebp
35+
; GENERIC-NEXT: xorl -12(%r11), %ebp
3636
; GENERIC-NEXT: shrl $24, %ebx
3737
; GENERIC-NEXT: movl (%r10,%rdi,4), %edi
3838
; GENERIC-NEXT: xorl (%r9,%rbx,4), %edi
39-
; GENERIC-NEXT: xorl -8(%r14), %edi
39+
; GENERIC-NEXT: xorl -8(%r11), %edi
4040
; GENERIC-NEXT: movl %ebp, %eax
4141
; GENERIC-NEXT: shrl $24, %eax
4242
; GENERIC-NEXT: movl (%r9,%rax,4), %eax
43-
; GENERIC-NEXT: testq %r11, %r11
44-
; GENERIC-NEXT: je LBB0_3
43+
; GENERIC-NEXT: subq $1, %r14
44+
; GENERIC-NEXT: jb LBB0_3
4545
; GENERIC-NEXT: ## %bb.2: ## %bb1
4646
; GENERIC-NEXT: ## in Loop: Header=BB0_1 Depth=1
4747
; GENERIC-NEXT: movl %edi, %ebx
4848
; GENERIC-NEXT: shrl $16, %ebx
4949
; GENERIC-NEXT: movzbl %bl, %ebx
5050
; GENERIC-NEXT: xorl (%r8,%rbx,4), %eax
51-
; GENERIC-NEXT: xorl -4(%r14), %eax
51+
; GENERIC-NEXT: xorl -4(%r11), %eax
5252
; GENERIC-NEXT: shrl $24, %edi
5353
; GENERIC-NEXT: movzbl %bpl, %ebx
5454
; GENERIC-NEXT: movl (%r10,%rbx,4), %ebx
5555
; GENERIC-NEXT: xorl (%r9,%rdi,4), %ebx
56-
; GENERIC-NEXT: xorl (%r14), %ebx
57-
; GENERIC-NEXT: decq %r11
58-
; GENERIC-NEXT: addq $16, %r14
56+
; GENERIC-NEXT: xorl (%r11), %ebx
57+
; GENERIC-NEXT: addq $16, %r11
5958
; GENERIC-NEXT: jmp LBB0_1
6059
; GENERIC-NEXT: LBB0_3: ## %bb2
6160
; GENERIC-NEXT: shlq $4, %rcx
@@ -99,12 +98,12 @@ define void @t(i8* nocapture %in, i8* nocapture %out, i32* nocapture %rk, i32 %r
9998
; ATOM-NEXT: ## kill: def $ecx killed $ecx def $rcx
10099
; ATOM-NEXT: movl (%rdx), %r15d
101100
; ATOM-NEXT: movl 4(%rdx), %eax
102-
; ATOM-NEXT: leaq 20(%rdx), %r14
101+
; ATOM-NEXT: leaq 20(%rdx), %r11
103102
; ATOM-NEXT: movq _Te0@{{.*}}(%rip), %r9
104103
; ATOM-NEXT: movq _Te1@{{.*}}(%rip), %r8
105104
; ATOM-NEXT: movq _Te3@{{.*}}(%rip), %r10
106105
; ATOM-NEXT: decl %ecx
107-
; ATOM-NEXT: movq %rcx, %r11
106+
; ATOM-NEXT: movq %rcx, %r14
108107
; ATOM-NEXT: .p2align 4, 0x90
109108
; ATOM-NEXT: LBB0_1: ## %bb
110109
; ATOM-NEXT: ## =>This Inner Loop Header: Depth=1
@@ -118,28 +117,27 @@ define void @t(i8* nocapture %in, i8* nocapture %out, i32* nocapture %rk, i32 %r
118117
; ATOM-NEXT: movzbl %r15b, %edi
119118
; ATOM-NEXT: xorl (%r9,%rbp,4), %ebx
120119
; ATOM-NEXT: movl (%r10,%rdi,4), %edi
121-
; ATOM-NEXT: xorl -12(%r14), %ebx
120+
; ATOM-NEXT: xorl -12(%r11), %ebx
122121
; ATOM-NEXT: xorl (%r9,%rax,4), %edi
123122
; ATOM-NEXT: movl %ebx, %eax
124-
; ATOM-NEXT: xorl -8(%r14), %edi
123+
; ATOM-NEXT: xorl -8(%r11), %edi
125124
; ATOM-NEXT: shrl $24, %eax
126125
; ATOM-NEXT: movl (%r9,%rax,4), %r15d
127-
; ATOM-NEXT: testq %r11, %r11
126+
; ATOM-NEXT: subq $1, %r14
128127
; ATOM-NEXT: movl %edi, %eax
129-
; ATOM-NEXT: je LBB0_3
128+
; ATOM-NEXT: jb LBB0_3
130129
; ATOM-NEXT: ## %bb.2: ## %bb1
131130
; ATOM-NEXT: ## in Loop: Header=BB0_1 Depth=1
132131
; ATOM-NEXT: shrl $16, %eax
133132
; ATOM-NEXT: shrl $24, %edi
134-
; ATOM-NEXT: decq %r11
135-
; ATOM-NEXT: movzbl %al, %ebp
133+
; ATOM-NEXT: movzbl %al, %eax
134+
; ATOM-NEXT: xorl (%r8,%rax,4), %r15d
136135
; ATOM-NEXT: movzbl %bl, %eax
137136
; ATOM-NEXT: movl (%r10,%rax,4), %eax
138-
; ATOM-NEXT: xorl (%r8,%rbp,4), %r15d
137+
; ATOM-NEXT: xorl -4(%r11), %r15d
139138
; ATOM-NEXT: xorl (%r9,%rdi,4), %eax
140-
; ATOM-NEXT: xorl -4(%r14), %r15d
141-
; ATOM-NEXT: xorl (%r14), %eax
142-
; ATOM-NEXT: addq $16, %r14
139+
; ATOM-NEXT: xorl (%r11), %eax
140+
; ATOM-NEXT: addq $16, %r11
143141
; ATOM-NEXT: jmp LBB0_1
144142
; ATOM-NEXT: LBB0_3: ## %bb2
145143
; ATOM-NEXT: shrl $16, %eax

llvm/test/CodeGen/X86/usub_inc_iv.ll

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -102,18 +102,19 @@ define i32 @test_02(i32* %p, i64 %len, i32 %x) {
102102
; CHECK-NEXT: entry:
103103
; CHECK-NEXT: br label [[LOOP:%.*]]
104104
; CHECK: loop:
105-
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ], [ [[LEN:%.*]], [[ENTRY:%.*]] ]
106-
; CHECK-NEXT: [[COND_1:%.*]] = icmp eq i64 [[IV]], 0
107-
; CHECK-NEXT: br i1 [[COND_1]], label [[EXIT:%.*]], label [[BACKEDGE]]
105+
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[MATH:%.*]], [[BACKEDGE:%.*]] ], [ [[LEN:%.*]], [[ENTRY:%.*]] ]
106+
; CHECK-NEXT: [[TMP0:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[IV]], i64 1)
107+
; CHECK-NEXT: [[MATH]] = extractvalue { i64, i1 } [[TMP0]], 0
108+
; CHECK-NEXT: [[OV:%.*]] = extractvalue { i64, i1 } [[TMP0]], 1
109+
; CHECK-NEXT: br i1 [[OV]], label [[EXIT:%.*]], label [[BACKEDGE]]
108110
; CHECK: backedge:
109111
; CHECK-NEXT: [[SUNKADDR:%.*]] = mul i64 [[IV]], 4
110-
; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P:%.*]] to i8*
111-
; CHECK-NEXT: [[SUNKADDR1:%.*]] = getelementptr i8, i8* [[TMP0]], i64 [[SUNKADDR]]
112+
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[P:%.*]] to i8*
113+
; CHECK-NEXT: [[SUNKADDR1:%.*]] = getelementptr i8, i8* [[TMP1]], i64 [[SUNKADDR]]
112114
; CHECK-NEXT: [[SUNKADDR2:%.*]] = getelementptr i8, i8* [[SUNKADDR1]], i64 -4
113-
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[SUNKADDR2]] to i32*
114-
; CHECK-NEXT: [[LOADED:%.*]] = load atomic i32, i32* [[TMP1]] unordered, align 4
115+
; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[SUNKADDR2]] to i32*
116+
; CHECK-NEXT: [[LOADED:%.*]] = load atomic i32, i32* [[TMP2]] unordered, align 4
115117
; CHECK-NEXT: [[COND_2:%.*]] = icmp eq i32 [[LOADED]], [[X:%.*]]
116-
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], -1
117118
; CHECK-NEXT: br i1 [[COND_2]], label [[FAILURE:%.*]], label [[LOOP]]
118119
; CHECK: exit:
119120
; CHECK-NEXT: ret i32 -1

0 commit comments

Comments
 (0)