Skip to content

Commit 6b852ff

Browse files
committed
[Sink] Process basic blocks with a single successor
This condition seems unnecessary.

Reviewed By: arsenm

Differential Revision: https://reviews.llvm.org/D93511
1 parent dcb71b5 commit 6b852ff

File tree

12 files changed

+435
-425
lines changed

12 files changed

+435
-425
lines changed

llvm/lib/Transforms/Scalar/Sink.cpp

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -174,9 +174,6 @@ static bool SinkInstruction(Instruction *Inst,
 
 static bool ProcessBlock(BasicBlock &BB, DominatorTree &DT, LoopInfo &LI,
                          AAResults &AA) {
-  // Can't sink anything out of a block that has less than two successors.
-  if (BB.getTerminator()->getNumSuccessors() <= 1) return false;
-
   // Don't bother sinking code out of unreachable blocks. In addition to being
   // unprofitable, it can also lead to infinite looping, because in an
   // unreachable loop there may be nowhere to stop.

llvm/test/CodeGen/AMDGPU/andorbitset.ll

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,13 +48,17 @@ define amdgpu_kernel void @s_set_midbit(i32 addrspace(1)* %out, i32 %in) {
4848
ret void
4949
}
5050

51+
@gv = external addrspace(1) global i32
52+
5153
; Make sure there's no verifier error with an undef source.
5254
; SI-LABEL: {{^}}bitset_verifier_error:
53-
; SI: s_bitset0_b32 s{{[0-9]+}}, 31
55+
; SI-NOT: %bb.1:
56+
; SI: s_bitset0_b32 s{{[0-9]+}}, 31
5457
define void @bitset_verifier_error() local_unnamed_addr #0 {
5558
bb:
5659
%i = call float @llvm.fabs.f32(float undef) #0
5760
%i1 = bitcast float %i to i32
61+
store i32 %i1, i32 addrspace(1)* @gv
5862
br label %bb2
5963

6064
bb2:

llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll

Lines changed: 28 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -120,25 +120,27 @@ define protected amdgpu_kernel void @nand(i32 addrspace(1)* %p, %S addrspace(1)*
120120
; CHECK: ; %bb.0:
121121
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
122122
; CHECK-NEXT: s_mov_b64 s[4:5], 0
123-
; CHECK-NEXT: v_mov_b32_e32 v0, 0
123+
; CHECK-NEXT: v_mov_b32_e32 v1, 0
124124
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
125125
; CHECK-NEXT: s_load_dword s6, s[0:1], 0x0
126126
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
127-
; CHECK-NEXT: v_mov_b32_e32 v1, s6
127+
; CHECK-NEXT: v_mov_b32_e32 v0, s6
128128
; CHECK-NEXT: .LBB5_1: ; %atomicrmw.start
129129
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
130-
; CHECK-NEXT: v_mov_b32_e32 v3, v1
131-
; CHECK-NEXT: v_not_b32_e32 v1, v3
132-
; CHECK-NEXT: v_or_b32_e32 v2, -2, v1
133-
; CHECK-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] glc
130+
; CHECK-NEXT: v_mov_b32_e32 v3, v0
131+
; CHECK-NEXT: v_not_b32_e32 v0, v3
132+
; CHECK-NEXT: v_or_b32_e32 v2, -2, v0
133+
; CHECK-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc
134134
; CHECK-NEXT: s_waitcnt vmcnt(0)
135-
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
135+
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
136136
; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
137137
; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5]
138138
; CHECK-NEXT: s_cbranch_execnz .LBB5_1
139139
; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end
140140
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
141-
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, 12, s[2:3]
141+
; CHECK-NEXT: v_mov_b32_e32 v2, s2
142+
; CHECK-NEXT: v_mov_b32_e32 v3, s3
143+
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3]
142144
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
143145
; CHECK-NEXT: global_store_dword v[0:1], v2, off
144146
; CHECK-NEXT: s_endpgm
@@ -330,26 +332,28 @@ define protected amdgpu_kernel void @fadd(float addrspace(1)* %p, %S addrspace(1
330332
; CHECK: ; %bb.0:
331333
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
332334
; CHECK-NEXT: s_mov_b64 s[4:5], 0
333-
; CHECK-NEXT: v_mov_b32_e32 v0, 0
335+
; CHECK-NEXT: v_mov_b32_e32 v1, 0
334336
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
335337
; CHECK-NEXT: s_load_dword s6, s[0:1], 0x0
336338
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
337-
; CHECK-NEXT: v_mov_b32_e32 v1, s6
339+
; CHECK-NEXT: v_mov_b32_e32 v0, s6
338340
; CHECK-NEXT: .LBB14_1: ; %atomicrmw.start
339341
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
340-
; CHECK-NEXT: v_mov_b32_e32 v3, v1
342+
; CHECK-NEXT: v_mov_b32_e32 v3, v0
341343
; CHECK-NEXT: v_add_f32_e32 v2, 1.0, v3
342-
; CHECK-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] glc
344+
; CHECK-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc
343345
; CHECK-NEXT: s_waitcnt vmcnt(0)
344-
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
346+
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
345347
; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
346348
; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5]
347349
; CHECK-NEXT: s_cbranch_execnz .LBB14_1
348350
; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end
349351
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
350-
; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v1
352+
; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v0
353+
; CHECK-NEXT: v_mov_b32_e32 v0, s2
354+
; CHECK-NEXT: v_mov_b32_e32 v1, s3
355+
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
351356
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
352-
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3]
353357
; CHECK-NEXT: global_store_dword v[0:1], v2, off
354358
; CHECK-NEXT: s_endpgm
355359
%f32 = atomicrmw fadd float addrspace(1)* %p, float 1.0 monotonic
@@ -365,26 +369,28 @@ define protected amdgpu_kernel void @fsub(float addrspace(1)* %p, %S addrspace(1
365369
; CHECK: ; %bb.0:
366370
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
367371
; CHECK-NEXT: s_mov_b64 s[4:5], 0
368-
; CHECK-NEXT: v_mov_b32_e32 v0, 0
372+
; CHECK-NEXT: v_mov_b32_e32 v1, 0
369373
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
370374
; CHECK-NEXT: s_load_dword s6, s[0:1], 0x0
371375
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
372-
; CHECK-NEXT: v_mov_b32_e32 v1, s6
376+
; CHECK-NEXT: v_mov_b32_e32 v0, s6
373377
; CHECK-NEXT: .LBB15_1: ; %atomicrmw.start
374378
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
375-
; CHECK-NEXT: v_mov_b32_e32 v3, v1
379+
; CHECK-NEXT: v_mov_b32_e32 v3, v0
376380
; CHECK-NEXT: v_add_f32_e32 v2, -1.0, v3
377-
; CHECK-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] glc
381+
; CHECK-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc
378382
; CHECK-NEXT: s_waitcnt vmcnt(0)
379-
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
383+
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
380384
; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
381385
; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5]
382386
; CHECK-NEXT: s_cbranch_execnz .LBB15_1
383387
; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end
384388
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
385-
; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v1
389+
; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v0
390+
; CHECK-NEXT: v_mov_b32_e32 v0, s2
391+
; CHECK-NEXT: v_mov_b32_e32 v1, s3
392+
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
386393
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
387-
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3]
388394
; CHECK-NEXT: global_store_dword v[0:1], v2, off
389395
; CHECK-NEXT: s_endpgm
390396
%f32 = atomicrmw fsub float addrspace(1)* %p, float 1.0 monotonic

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -205,13 +205,16 @@ main_body:
205205
ret void
206206
}
207207

208+
@gv = external addrspace(1) global i32
209+
208210
;GCN-LABEL: {{^}}s_buffer_load_index_across_bb:
209211
;GCN-NOT: s_waitcnt;
210212
;GCN: v_or_b32
211213
;GCN: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
212214
define amdgpu_ps void @s_buffer_load_index_across_bb(<4 x i32> inreg %desc, i32 %index) {
213215
main_body:
214216
%tmp = shl i32 %index, 4
217+
store i32 %tmp, i32 addrspace(1)* @gv
215218
br label %bb1
216219

217220
bb1: ; preds = %main_body
@@ -224,10 +227,7 @@ bb1: ; preds = %main_body
224227

225228
;GCN-LABEL: {{^}}s_buffer_load_index_across_bb_merged:
226229
;GCN-NOT: s_waitcnt;
227-
;GCN: v_or_b32
228-
;GCN: v_or_b32
229-
;GCN: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
230-
;GCN: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
230+
;GCN: buffer_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen offset:8
231231
define amdgpu_ps void @s_buffer_load_index_across_bb_merged(<4 x i32> inreg %desc, i32 %index) {
232232
main_body:
233233
%tmp = shl i32 %index, 4

llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@
1919
define amdgpu_kernel void @local_stack_offset_uses_sp(i64 addrspace(1)* %out) {
2020
; MUBUF-LABEL: local_stack_offset_uses_sp:
2121
; MUBUF: ; %bb.0: ; %entry
22-
; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
2322
; MUBUF-NEXT: s_add_u32 s0, s0, s9
2423
; MUBUF-NEXT: v_mov_b32_e32 v1, 0x3000
2524
; MUBUF-NEXT: s_addc_u32 s1, s1, 0
@@ -48,17 +47,17 @@ define amdgpu_kernel void @local_stack_offset_uses_sp(i64 addrspace(1)* %out) {
4847
; MUBUF-NEXT: s_waitcnt vmcnt(0)
4948
; MUBUF-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen offset:4 glc
5049
; MUBUF-NEXT: s_waitcnt vmcnt(0)
50+
; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
51+
; MUBUF-NEXT: v_mov_b32_e32 v6, 0
5152
; MUBUF-NEXT: v_add_co_u32_e32 v0, vcc, v2, v4
5253
; MUBUF-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v5, vcc
53-
; MUBUF-NEXT: v_mov_b32_e32 v2, 0
5454
; MUBUF-NEXT: s_waitcnt lgkmcnt(0)
55-
; MUBUF-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
55+
; MUBUF-NEXT: global_store_dwordx2 v6, v[0:1], s[4:5]
5656
; MUBUF-NEXT: s_waitcnt vmcnt(0)
5757
; MUBUF-NEXT: s_endpgm
5858
;
5959
; FLATSCR-LABEL: local_stack_offset_uses_sp:
6060
; FLATSCR: ; %bb.0: ; %entry
61-
; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
6261
; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s2, s5
6362
; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
6463
; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
@@ -82,11 +81,12 @@ define amdgpu_kernel void @local_stack_offset_uses_sp(i64 addrspace(1)* %out) {
8281
; FLATSCR-NEXT: s_movk_i32 s2, 0x3000
8382
; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s2 offset:64 glc
8483
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
84+
; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
85+
; FLATSCR-NEXT: v_mov_b32_e32 v4, 0
8586
; FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
8687
; FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
87-
; FLATSCR-NEXT: v_mov_b32_e32 v2, 0
8888
; FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
89-
; FLATSCR-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
89+
; FLATSCR-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
9090
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
9191
; FLATSCR-NEXT: s_endpgm
9292
entry:
@@ -203,7 +203,6 @@ entry:
203203
define amdgpu_kernel void @local_stack_offset_uses_sp_flat(<3 x i64> addrspace(1)* %out) {
204204
; MUBUF-LABEL: local_stack_offset_uses_sp_flat:
205205
; MUBUF: ; %bb.0: ; %entry
206-
; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
207206
; MUBUF-NEXT: s_add_u32 s0, s0, s9
208207
; MUBUF-NEXT: s_addc_u32 s1, s1, 0
209208
; MUBUF-NEXT: v_mov_b32_e32 v0, 0x4000
@@ -239,9 +238,10 @@ define amdgpu_kernel void @local_stack_offset_uses_sp_flat(<3 x i64> addrspace(1
239238
; MUBUF-NEXT: v_mov_b32_e32 v13, 0x4000
240239
; MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen glc
241240
; MUBUF-NEXT: s_waitcnt vmcnt(0)
242-
; MUBUF-NEXT: v_mov_b32_e32 v12, 0
241+
; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
243242
; MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen glc
244243
; MUBUF-NEXT: s_waitcnt vmcnt(0)
244+
; MUBUF-NEXT: v_mov_b32_e32 v12, 0
245245
; MUBUF-NEXT: buffer_load_dword v8, v13, s[0:3], 0 offen glc
246246
; MUBUF-NEXT: s_waitcnt vmcnt(0)
247247
; MUBUF-NEXT: v_mov_b32_e32 v13, 0x4000
@@ -274,7 +274,6 @@ define amdgpu_kernel void @local_stack_offset_uses_sp_flat(<3 x i64> addrspace(1
274274
;
275275
; FLATSCR-LABEL: local_stack_offset_uses_sp_flat:
276276
; FLATSCR: ; %bb.0: ; %entry
277-
; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
278277
; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s2, s5
279278
; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
280279
; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
@@ -303,6 +302,7 @@ define amdgpu_kernel void @local_stack_offset_uses_sp_flat(<3 x i64> addrspace(1
303302
; FLATSCR-NEXT: s_movk_i32 s2, 0x2000
304303
; FLATSCR-NEXT: scratch_load_dwordx4 v[4:7], off, s2 glc
305304
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
305+
; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
306306
; FLATSCR-NEXT: v_mov_b32_e32 v12, 0
307307
; FLATSCR-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6
308308
; FLATSCR-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc

llvm/test/CodeGen/AMDGPU/operand-folding.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ define amdgpu_kernel void @no_fold_tied_subregister() #1 {
126126

127127
; There should be exactly one folding on the same operand.
128128
; CHECK-LABEL: {{^}}no_extra_fold_on_same_opnd
129-
; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
129+
; CHECK-NOT: %bb.1:
130130
; CHECK: v_xor_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
131131
define void @no_extra_fold_on_same_opnd() #1 {
132132
entry:

0 commit comments

Comments
 (0)