Commit 6e30f37

[AMDGPU] always emit a soft wait even if it is trivially ~0
The memory legalizer is currently responsible for emitting wait instructions at ordering operations such as acquire and release. It tries to be efficient by emitting waits only when required; in particular, it does not emit a wait on vmcnt at workgroup scope, since that ordering is already guaranteed by the architecture. This is now incorrect, because direct loads to LDS have an LDS component that needs explicit ordering on vmcnt. However, it would be inefficient to always emit a wait on vmcnt, since the majority of programs do not use direct loads to LDS, and such a wait would affect every workgroup-scope operation.

As a first step towards handling this, the memory legalizer now emits a soft wait instruction even if all counts are trivially ~0. This is a placeholder that the SIInsertWaitcnts pass will either optimize away or strengthen, based on its analysis of whether direct loads to LDS are pending at that point in the program.
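To make the "trivially ~0" placeholder concrete, here is a minimal, self-contained C++ sketch of the new decision logic, condensed from the SIMemoryLegalizer.cpp hunks below. The bit-field widths, the simplified encodeWaitcnt packing, and the emitSoftWait helper are illustrative stand-ins, not the real AMDGPU encoding or API; the point is only that the soft wait is now emitted unconditionally, with an all-ones "no wait" immediate when neither count needs ordering.

// Illustrative only: models the "always emit a soft wait" behavior from the
// SIMemoryLegalizer.cpp hunks below. Bit widths and the packing layout are
// simplified stand-ins, not the real AMDGPU::encodeWaitcnt encoding.
#include <cstdio>

// Hypothetical per-counter bit masks ("no wait" = all ones for that field).
static unsigned vmcntBitMask()   { return 0xF; }
static unsigned expcntBitMask()  { return 0x7; }
static unsigned lgkmcntBitMask() { return 0xF; }

// Simplified packing: [lgkmcnt | expcnt | vmcnt].
static unsigned encodeWaitcnt(unsigned Vm, unsigned Exp, unsigned Lgkm) {
  return (Lgkm << 7) | (Exp << 4) | Vm;
}

// Mirrors the new legalizer behavior: the soft wait is emitted unconditionally;
// a trivially "no wait" immediate is a placeholder for SIInsertWaitcnts to
// either drop or strengthen later.
static void emitSoftWait(bool VMCnt, bool LGKMCnt) {
  unsigned Imm = encodeWaitcnt(VMCnt ? 0 : vmcntBitMask(), expcntBitMask(),
                               LGKMCnt ? 0 : lgkmcntBitMask());
  std::printf("S_WAITCNT_soft 0x%x%s\n", Imm,
              (!VMCnt && !LGKMCnt) ? "  ; trivially no-wait placeholder" : "");
}

int main() {
  emitSoftWait(/*VMCnt=*/false, /*LGKMCnt=*/true);  // e.g. a wait on lgkmcnt only
  emitSoftWait(/*VMCnt=*/false, /*LGKMCnt=*/false); // previously nothing was emitted here
}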
1 parent adbd8e7 commit 6e30f37

41 files changed: +13845 −763 lines changed

llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp

Lines changed: 25 additions & 33 deletions
@@ -1074,8 +1074,6 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                     SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                     bool IsCrossAddrSpaceOrdering, Position Pos,
                                     AtomicOrdering Order) const {
-  bool Changed = false;
-
   MachineBasicBlock &MBB = *MI->getParent();
   DebugLoc DL = MI->getDebugLoc();
 
@@ -1149,21 +1147,19 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
     }
   }
 
-  if (VMCnt || LGKMCnt) {
-    unsigned WaitCntImmediate =
-        AMDGPU::encodeWaitcnt(IV,
-                              VMCnt ? 0 : getVmcntBitMask(IV),
-                              getExpcntBitMask(IV),
-                              LGKMCnt ? 0 : getLgkmcntBitMask(IV));
-    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
-        .addImm(WaitCntImmediate);
-    Changed = true;
-  }
+  // Always emit a soft wait count, even if it is trivially ~0. SIInsertWaitcnts
+  // will later use this marker to add additional waits such as those required
+  // from direct load to LDS (formerly known as LDS DMA).
+  unsigned WaitCntImmediate = AMDGPU::encodeWaitcnt(
+      IV, VMCnt ? 0 : getVmcntBitMask(IV), getExpcntBitMask(IV),
+      LGKMCnt ? 0 : getLgkmcntBitMask(IV));
+  BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
+      .addImm(WaitCntImmediate);
 
   if (Pos == Position::AFTER)
     --MI;
 
-  return Changed;
+  return true;
 }
 
 bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
@@ -1966,8 +1962,6 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsCrossAddrSpaceOrdering,
                                      Position Pos, AtomicOrdering Order) const {
-  bool Changed = false;
-
   MachineBasicBlock &MBB = *MI->getParent();
   DebugLoc DL = MI->getDebugLoc();
 
@@ -2057,28 +2051,25 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
     }
   }
 
-  if (VMCnt || LGKMCnt) {
-    unsigned WaitCntImmediate =
-        AMDGPU::encodeWaitcnt(IV,
-                              VMCnt ? 0 : getVmcntBitMask(IV),
-                              getExpcntBitMask(IV),
-                              LGKMCnt ? 0 : getLgkmcntBitMask(IV));
-    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
-        .addImm(WaitCntImmediate);
-    Changed = true;
-  }
+  // Always emit a soft wait count, even if it is trivially ~0. SIInsertWaitcnts
+  // will later use this marker to add additional waits such as those required
+  // from direct load to LDS (formerly known as LDS DMA).
+  unsigned WaitCntImmediate = AMDGPU::encodeWaitcnt(
+      IV, VMCnt ? 0 : getVmcntBitMask(IV), getExpcntBitMask(IV),
+      LGKMCnt ? 0 : getLgkmcntBitMask(IV));
+  BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
+      .addImm(WaitCntImmediate);
 
   if (VSCnt) {
     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft))
         .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
         .addImm(0);
-    Changed = true;
   }
 
   if (Pos == Position::AFTER)
     --MI;
 
-  return Changed;
+  return true;
 }
 
 bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
@@ -2287,8 +2278,6 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsCrossAddrSpaceOrdering,
                                      Position Pos, AtomicOrdering Order) const {
-  bool Changed = false;
-
   MachineBasicBlock &MBB = *MI->getParent();
   DebugLoc DL = MI->getDebugLoc();
 
@@ -2372,23 +2361,26 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
       BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0);
     }
     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_soft)).addImm(0);
-    Changed = true;
+  } else {
+    // Always emit a soft wait count, even if it is trivially ~0.
+    // SIInsertWaitcnts will later use this marker to add additional waits such
+    // as those required from direct load to LDS (formerly known as LDS DMA).
+    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_soft))
+        .addImm(getLoadcntBitMask(IV));
   }
 
   if (STORECnt) {
     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_STORECNT_soft)).addImm(0);
-    Changed = true;
   }
 
   if (DSCnt) {
     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_DSCNT_soft)).addImm(0);
-    Changed = true;
   }
 
   if (Pos == Position::AFTER)
     --MI;
 
-  return Changed;
+  return true;
 }
 
 bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,

llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll

Lines changed: 6 additions & 6 deletions
@@ -880,8 +880,8 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel(i32 %n) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, 0x100, v0
 ; GFX10-NEXT:    scratch_store_dword v0, v2, off offset:128
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, 7
 ; GFX10-NEXT:    s_add_u32 s0, 0x100, s0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v1, s0, v1
@@ -921,8 +921,8 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel(i32 %n) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
 ; GFX11-NEXT:    scratch_store_b32 v0, v2, off offset:384 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_lshl_b32 s0, s0, 7
 ; GFX11-NEXT:    s_add_u32 s0, 0x100, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -991,8 +991,8 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel(i32 %n) {
 ; UNALIGNED_GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
 ; UNALIGNED_GFX10-NEXT:    v_add_nc_u32_e32 v0, 0x100, v0
 ; UNALIGNED_GFX10-NEXT:    scratch_store_dword v0, v2, off offset:128
-; UNALIGNED_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; UNALIGNED_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; UNALIGNED_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; UNALIGNED_GFX10-NEXT:    s_lshl_b32 s0, s0, 7
 ; UNALIGNED_GFX10-NEXT:    s_add_u32 s0, 0x100, s0
 ; UNALIGNED_GFX10-NEXT:    v_add_nc_u32_e32 v1, s0, v1
@@ -1032,8 +1032,8 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel(i32 %n) {
 ; UNALIGNED_GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
 ; UNALIGNED_GFX11-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
 ; UNALIGNED_GFX11-NEXT:    scratch_store_b32 v0, v2, off offset:384 dlc
-; UNALIGNED_GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; UNALIGNED_GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; UNALIGNED_GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; UNALIGNED_GFX11-NEXT:    s_lshl_b32 s0, s0, 7
 ; UNALIGNED_GFX11-NEXT:    s_add_u32 s0, 0x100, s0
 ; UNALIGNED_GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -1520,8 +1520,8 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel(i32 %n) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, 0x4004, v0
 ; GFX10-NEXT:    scratch_store_dword v0, v2, off offset:128
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, 7
 ; GFX10-NEXT:    s_add_u32 s0, 0x4004, s0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v1, s0, v1
@@ -1633,8 +1633,8 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel(i32 %n) {
 ; UNALIGNED_GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
 ; UNALIGNED_GFX10-NEXT:    v_add_nc_u32_e32 v0, 0x4004, v0
 ; UNALIGNED_GFX10-NEXT:    scratch_store_dword v0, v2, off offset:128
-; UNALIGNED_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; UNALIGNED_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; UNALIGNED_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; UNALIGNED_GFX10-NEXT:    s_lshl_b32 s0, s0, 7
 ; UNALIGNED_GFX10-NEXT:    s_add_u32 s0, 0x4004, s0
 ; UNALIGNED_GFX10-NEXT:    v_add_nc_u32_e32 v1, s0, v1
