Commit e09f98a

AMDGPU: Fix LiveVariables error after optimizing VGPR ranges
Depending on the specific depth-first visit order, this was not removing the block from the live set. Fixes a verifier error in the OpenCL conformance tests.
1 parent: 8100426

3 files changed: +254 -23 lines changed
llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp

Lines changed: 18 additions & 23 deletions
@@ -371,47 +371,42 @@ void SIOptimizeVGPRLiveRange::collectWaterfallCandidateRegisters(
 // Re-calculate the liveness of \p Reg in the THEN-region
 void SIOptimizeVGPRLiveRange::updateLiveRangeInThenRegion(
     Register Reg, MachineBasicBlock *If, MachineBasicBlock *Flow) const {
-
-  SmallPtrSet<MachineBasicBlock *, 16> PHIIncoming;
-
-  MachineBasicBlock *ThenEntry = nullptr;
-  for (auto *Succ : If->successors()) {
-    if (Succ != Flow) {
-      ThenEntry = Succ;
-      break;
+  SetVector<MachineBasicBlock *> Blocks;
+  SmallVector<MachineBasicBlock *> WorkList({If});
+
+  // Collect all successors until we see the flow block, where we should
+  // reconverge.
+  while (!WorkList.empty()) {
+    auto *MBB = WorkList.pop_back_val();
+    for (auto *Succ : MBB->successors()) {
+      if (Succ != Flow && !Blocks.contains(Succ)) {
+        WorkList.push_back(Succ);
+        Blocks.insert(Succ);
+      }
     }
   }
-  assert(ThenEntry && "No successor in Then region?");

   LiveVariables::VarInfo &OldVarInfo = LV->getVarInfo(Reg);
-  df_iterator_default_set<MachineBasicBlock *, 16> Visited;
-
-  for (MachineBasicBlock *MBB : depth_first_ext(ThenEntry, Visited)) {
-    if (MBB == Flow)
-      break;
-
+  for (MachineBasicBlock *MBB : Blocks) {
     // Clear Live bit, as we will recalculate afterwards
     LLVM_DEBUG(dbgs() << "Clear AliveBlock " << printMBBReference(*MBB)
                       << '\n');
     OldVarInfo.AliveBlocks.reset(MBB->getNumber());
   }

+  SmallPtrSet<MachineBasicBlock *, 4> PHIIncoming;
+
   // Get the blocks the Reg should be alive through
   for (auto I = MRI->use_nodbg_begin(Reg), E = MRI->use_nodbg_end(); I != E;
        ++I) {
     auto *UseMI = I->getParent();
     if (UseMI->isPHI() && I->readsReg()) {
-      if (Visited.contains(UseMI->getParent()))
+      if (Blocks.contains(UseMI->getParent()))
         PHIIncoming.insert(UseMI->getOperand(I.getOperandNo() + 1).getMBB());
     }
   }

-  Visited.clear();
-
-  for (MachineBasicBlock *MBB : depth_first_ext(ThenEntry, Visited)) {
-    if (MBB == Flow)
-      break;
-
+  for (MachineBasicBlock *MBB : Blocks) {
     SmallVector<MachineInstr *> Uses;
     // PHI instructions has been processed before.
     findNonPHIUsesInBlock(Reg, MBB, Uses);
@@ -438,7 +433,7 @@ void SIOptimizeVGPRLiveRange::updateLiveRangeInThenRegion(

   // Set the isKilled flag if we get new Kills in the THEN region.
   for (auto *MI : OldVarInfo.Kills) {
-    if (Visited.contains(MI->getParent()))
+    if (Blocks.contains(MI->getParent()))
       MI->addRegisterKilled(Reg, TRI);
   }
 }
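
To make the failure mode concrete, here is a small standalone C++ sketch; it is not LLVM code, and the Block struct, block names, and block numbers are invented for illustration (the real pass works on MachineBasicBlock with SetVector/SmallVector). It contrasts the old depth-first walk, which breaks out of the loop as soon as the flow block is reached and therefore can skip a THEN-region block, with the worklist collection used in the hunk above, which keeps draining until empty and never enqueues the flow block. The CFG in main() mirrors the shape of the new MIR test: the DFS reaches the flow block through one path and stops before ever visiting the block on the other path, so that block's AliveBlocks bit would stay set.

// Standalone sketch, not LLVM code: types and the example CFG are made up.
#include <cstdio>
#include <set>
#include <vector>

struct Block {
  int Number;
  std::vector<Block *> Succs;
};

// Old scheme: pre-order DFS from the THEN entry that stops the entire walk as
// soon as the flow block is reached. The early return plays the role of the
// old `break`, abandoning everything still waiting on the stack.
static std::set<Block *> dfsBreakingAtFlow(Block *ThenEntry, Block *Flow) {
  std::set<Block *> Pushed{ThenEntry};
  std::vector<Block *> Stack{ThenEntry};
  std::set<Block *> Visited;
  while (!Stack.empty()) {
    Block *MBB = Stack.back();
    Stack.pop_back();
    if (MBB == Flow)
      return Visited; // the old `break`: drops everything still on the stack
    Visited.insert(MBB);
    // Push successors in reverse so the first successor is explored first,
    // roughly matching LLVM's df_iterator visit order.
    for (auto It = MBB->Succs.rbegin(), E = MBB->Succs.rend(); It != E; ++It)
      if (Pushed.insert(*It).second)
        Stack.push_back(*It);
  }
  return Visited;
}

// New scheme (shape of the fixed code above): collect every block reachable
// from the If block, never enqueueing the flow block, until the worklist is
// empty. The result no longer depends on visit order.
static std::set<Block *> collectUntilFlow(Block *If, Block *Flow) {
  std::set<Block *> Blocks;
  std::vector<Block *> WorkList{If};
  while (!WorkList.empty()) {
    Block *MBB = WorkList.back();
    WorkList.pop_back();
    for (Block *Succ : MBB->Succs)
      if (Succ != Flow && Blocks.insert(Succ).second)
        WorkList.push_back(Succ);
  }
  return Blocks;
}

int main() {
  // If -> {Then, Flow}; Then -> {B1, B2}; B1 -> C; B2 -> C; C -> Flow.
  // The DFS goes Then, B1, C, then pops Flow and stops, never seeing B2.
  Block Flow{5, {}};
  Block C{4, {&Flow}};
  Block B1{2, {&C}};
  Block B2{3, {&C}};
  Block Then{1, {&B1, &B2}};
  Block If{0, {&Then, &Flow}};

  std::set<Block *> Old = dfsBreakingAtFlow(&Then, &Flow);
  std::set<Block *> New = collectUntilFlow(&If, &Flow);

  std::printf("old DFS visited B2:  %s\n", Old.count(&B2) ? "yes" : "no");
  std::printf("new worklist has B2: %s\n", New.count(&B2) ? "yes" : "no");
  return 0;
}

Under these assumptions the sketch prints "no" for the old walk and "yes" for the worklist, which is exactly why the fixed pass can clear and then recompute liveness for every block of the THEN region regardless of successor order.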
Lines changed: 134 additions & 0 deletions
@@ -0,0 +1,134 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 -verify-machineinstrs -start-after=unreachable-mbb-elimination -stop-after=phi-node-elimination -o - %s | FileCheck %s
+
+# FIXME: Should be able to just use run-pass, but need to keep
+# LiveVariables live after for the verifier. Also -start-before
+# doesn't work here for some reason.
+
+# LiveVariables needs to remove %bb.3 from the live blocks for %1
+# after the phi is introduced, but was previously missed due to
+# encountering the flow block first in the depth first search.
+
+---
+name: live_variable_update
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: live_variable_update
+  ; CHECK: bb.0:
+  ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.5(0x40000000)
+  ; CHECK-NEXT: liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY killed $sgpr4_sgpr5
+  ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0
+  ; CHECK-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, [[COPY1]], implicit $exec
+  ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY killed [[COPY1]]
+  ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $exec_lo, implicit-def $exec_lo
+  ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY3]], killed [[V_CMP_NE_U32_e64_]], implicit-def dead $scc
+  ; CHECK-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[S_AND_B32_]], [[COPY3]], implicit-def dead $scc
+  ; CHECK-NEXT: $exec_lo = S_MOV_B32_term killed [[S_AND_B32_]]
+  ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.5, implicit $exec
+  ; CHECK-NEXT: S_BRANCH %bb.2
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT: successors: %bb.7(0x80000000)
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[COPY]], 0, 0 :: (dereferenceable invariant load (s64), align 16, addrspace 4)
+  ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, killed %15, 0, implicit $exec
+  ; CHECK-NEXT: %7:vgpr_32, dead %8:sreg_32_xm0_xexec = V_ADDC_U32_e64 0, killed [[S_LOAD_DWORDX2_IMM]].sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+  ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_ADD_CO_U32_e64_]], %subreg.sub0, killed %7, %subreg.sub1
+  ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
+  ; CHECK-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
+  ; CHECK-NEXT: GLOBAL_STORE_BYTE killed [[V_MOV_B]], killed [[GLOBAL_LOAD_UBYTE]], 0, 0, implicit $exec :: (store (s8), addrspace 1)
+  ; CHECK-NEXT: S_BRANCH %bb.7
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.3(0x40000000)
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: S_CBRANCH_SCC0 %bb.4, implicit undef $scc
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT: successors: %bb.6(0x80000000)
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: S_BRANCH %bb.6
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT: successors: %bb.6(0x80000000)
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
+  ; CHECK-NEXT: dead %13:vgpr_32 = GLOBAL_LOAD_UBYTE killed [[V_MOV_B1]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
+  ; CHECK-NEXT: S_BRANCH %bb.6
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.7(0x40000000)
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: [[S_OR_SAVEEXEC_B32_:%[0-9]+]]:sreg_32 = S_OR_SAVEEXEC_B32 killed [[S_XOR_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
+  ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY killed [[COPY2]]
+  ; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 $exec_lo, [[S_OR_SAVEEXEC_B32_]], implicit-def $scc
+  ; CHECK-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_B32_1]], implicit-def $scc
+  ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.7, implicit $exec
+  ; CHECK-NEXT: S_BRANCH %bb.1
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: bb.6:
+  ; CHECK-NEXT: successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT: S_BRANCH %bb.5
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: bb.7:
+  ; CHECK-NEXT: $exec_lo = S_OR_B32 $exec_lo, killed [[S_AND_B32_1]], implicit-def $scc
+  ; CHECK-NEXT: S_ENDPGM 0
+  bb.0:
+    successors: %bb.2(0x40000000), %bb.5(0x40000000)
+    liveins: $vgpr0, $sgpr4_sgpr5
+
+    %0:sgpr_64 = COPY $sgpr4_sgpr5
+    %1:vgpr_32 = COPY $vgpr0
+    %2:sreg_32 = V_CMP_NE_U32_e64 0, %1, implicit $exec
+    %3:sreg_32 = SI_IF killed %2, %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.2
+
+  bb.1:
+    successors: %bb.7(0x80000000)
+
+    %4:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 0, 0 :: (dereferenceable invariant load (s64), align 16, addrspace 4)
+    %5:vgpr_32, %6:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 %4.sub0, %1, 0, implicit $exec
+    %7:vgpr_32, dead %8:sreg_32_xm0_xexec = V_ADDC_U32_e64 0, %4.sub1, killed %6, 0, implicit $exec
+    %9:vreg_64 = REG_SEQUENCE %5, %subreg.sub0, %7, %subreg.sub1
+    %10:vgpr_32 = GLOBAL_LOAD_UBYTE killed %9, 0, 0, implicit $exec :: (load (s8), addrspace 1)
+    %11:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
+    GLOBAL_STORE_BYTE killed %11, killed %10, 0, 0, implicit $exec :: (store (s8), addrspace 1)
+    S_BRANCH %bb.7
+
+  bb.2:
+    successors: %bb.4(0x40000000), %bb.3(0x40000000)
+
+    S_CBRANCH_SCC0 %bb.4, implicit undef $scc
+
+  bb.3:
+    successors: %bb.6(0x80000000)
+
+    S_BRANCH %bb.6
+
+  bb.4:
+    successors: %bb.6(0x80000000)
+
+    %12:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
+    %13:vgpr_32 = GLOBAL_LOAD_UBYTE killed %12, 0, 0, implicit $exec :: (load (s8), addrspace 1)
+    S_BRANCH %bb.6
+
+  bb.5:
+    successors: %bb.1(0x40000000), %bb.7(0x40000000)
+
+    %14:sreg_32 = SI_ELSE %3, %bb.7, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.6:
+    successors: %bb.5(0x80000000)
+
+    S_BRANCH %bb.5
+
+  bb.7:
+    SI_END_CF %14, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_ENDPGM 0
+
+...

llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll

Lines changed: 102 additions & 0 deletions
@@ -443,4 +443,106 @@ end:
   ret float %r2
 }

+define amdgpu_kernel void @livevariables_update_missed_block(i8 addrspace(1)* %src1) {
+; SI-LABEL: name: livevariables_update_missed_block
+; SI: bb.0.entry:
+; SI-NEXT: successors: %bb.2(0x40000000), %bb.5(0x40000000)
+; SI-NEXT: liveins: $vgpr0, $sgpr0_sgpr1
+; SI-NEXT: {{ $}}
+; SI-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY killed $sgpr0_sgpr1
+; SI-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY killed $vgpr0
+; SI-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, [[COPY1]](s32), implicit $exec
+; SI-NEXT: [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_NE_U32_e64_]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+; SI-NEXT: S_BRANCH %bb.2
+; SI-NEXT: {{ $}}
+; SI-NEXT: bb.1.if.then:
+; SI-NEXT: successors: %bb.7(0x80000000)
+; SI-NEXT: {{ $}}
+; SI-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.src1.kernarg.offset.cast, align 4, addrspace 4)
+; SI-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, killed %50, 0, implicit $exec
+; SI-NEXT: %43:vgpr_32, dead %45:sreg_32_xm0_xexec = V_ADDC_U32_e64 0, killed [[S_LOAD_DWORDX2_IMM]].sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+; SI-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_ADD_CO_U32_e64_]], %subreg.sub0, killed %43, %subreg.sub1
+; SI-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8) from %ir.i10, addrspace 1)
+; SI-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
+; SI-NEXT: GLOBAL_STORE_BYTE killed [[V_MOV_B]], killed [[GLOBAL_LOAD_UBYTE]], 0, 0, implicit $exec :: (store (s8) into `i8 addrspace(1)* null`, addrspace 1)
+; SI-NEXT: S_BRANCH %bb.7
+; SI-NEXT: {{ $}}
+; SI-NEXT: bb.2.if.then9:
+; SI-NEXT: successors: %bb.4(0x40000000), %bb.3(0x40000000)
+; SI-NEXT: {{ $}}
+; SI-NEXT: S_CBRANCH_SCC0 %bb.4, implicit undef $scc
+; SI-NEXT: {{ $}}
+; SI-NEXT: bb.3:
+; SI-NEXT: successors: %bb.6(0x80000000)
+; SI-NEXT: {{ $}}
+; SI-NEXT: S_BRANCH %bb.6
+; SI-NEXT: {{ $}}
+; SI-NEXT: bb.4.sw.bb:
+; SI-NEXT: successors: %bb.6(0x80000000)
+; SI-NEXT: {{ $}}
+; SI-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
+; SI-NEXT: [[GLOBAL_LOAD_UBYTE1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE killed [[V_MOV_B1]], 0, 0, implicit $exec :: (load (s8) from `i8 addrspace(1)* null`, addrspace 1)
+; SI-NEXT: S_BRANCH %bb.6
+; SI-NEXT: {{ $}}
+; SI-NEXT: bb.5.Flow:
+; SI-NEXT: successors: %bb.1(0x40000000), %bb.7(0x40000000)
+; SI-NEXT: {{ $}}
+; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[COPY1]](s32), %bb.0, undef %51:vgpr_32, %bb.6
+; SI-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.7, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+; SI-NEXT: S_BRANCH %bb.1
+; SI-NEXT: {{ $}}
+; SI-NEXT: bb.6.sw.bb18:
+; SI-NEXT: successors: %bb.5(0x80000000)
+; SI-NEXT: {{ $}}
+; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI undef %39:vgpr_32, %bb.3, [[GLOBAL_LOAD_UBYTE1]], %bb.4
+; SI-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
+; SI-NEXT: GLOBAL_STORE_BYTE killed [[V_MOV_B2]], killed [[PHI1]], 0, 0, implicit $exec :: (store (s8) into `i8 addrspace(1)* null`, addrspace 1)
+; SI-NEXT: S_BRANCH %bb.5
+; SI-NEXT: {{ $}}
+; SI-NEXT: bb.7.UnifiedReturnBlock:
+; SI-NEXT: SI_END_CF killed [[SI_ELSE]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+; SI-NEXT: S_ENDPGM 0
+entry:
+  %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %i4 = add i32 0, %i2
+  %i5 = zext i32 %i4 to i64
+  %i6 = add i64 0, %i5
+  %add = add i64 %i6, 0
+  %cmp2 = icmp ult i64 %add, 1
+  br i1 %cmp2, label %if.then, label %if.then9
+
+if.then: ; preds = %entry
+  %i9 = mul i64 %i6, 1
+  %i10 = getelementptr inbounds i8, i8 addrspace(1)* %src1, i64 %i9
+  %i11 = load i8, i8 addrspace(1)* %i10, align 1
+  %i12 = insertelement <3 x i8> zeroinitializer, i8 %i11, i64 0
+  %i13 = insertelement <3 x i8> %i12, i8 0, i64 1
+  %i14 = insertelement <3 x i8> %i13, i8 0, i64 1
+  %i15 = select <3 x i1> zeroinitializer, <3 x i8> zeroinitializer, <3 x i8> %i14
+  %i16 = extractelement <3 x i8> %i15, i64 0
+  store i8 %i16, i8 addrspace(1)* null, align 1
+  ret void
+
+if.then9: ; preds = %entry
+  br i1 undef, label %sw.bb18, label %sw.bb
+
+sw.bb: ; preds = %if.then9
+  %i17 = load i8, i8 addrspace(1)* null, align 1
+  %i18 = insertelement <4 x i8> zeroinitializer, i8 %i17, i64 0
+  %a.sroa.0.0.vecblend = shufflevector <4 x i8> %i18, <4 x i8> zeroinitializer, <4 x i32> <i32 0, i32 0, i32 0, i32 undef>
+  br label %sw.bb18
+
+sw.bb18: ; preds = %sw.bb, %if.then9
+  %a.sroa.0.0 = phi <4 x i8> [ %a.sroa.0.0.vecblend, %sw.bb ], [ undef, %if.then9 ]
+  %a.sroa.0.0.vec.extract61 = shufflevector <4 x i8> %a.sroa.0.0, <4 x i8> zeroinitializer, <3 x i32> <i32 undef, i32 1, i32 undef>
+  %i19 = insertelement <3 x i8> %a.sroa.0.0.vec.extract61, i8 0, i64 0
+  %i20 = select <3 x i1> zeroinitializer, <3 x i8> zeroinitializer, <3 x i8> %i19
+  %i21 = extractelement <3 x i8> %i20, i64 1
+  store i8 %i21, i8 addrspace(1)* null, align 1
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
 attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone speculatable willreturn }
