Skip to content

Commit 51e0240

Browse files
committed
[AMDGPU] Produce waitcounts for LDS DMA
MUBUF and FLAT LDS DMA operations need a wait on vmcnt before the LDS data they write can be accessed. A load from LDS to VMEM does not need a wait. Differential Revision: https://reviews.llvm.org/D124626
1 parent 5a79364 commit 51e0240

File tree

2 files changed

+109
-3
lines changed

2 files changed

+109
-3
lines changed

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,7 @@ enum RegisterMapping {
122122
AGPR_OFFSET = 256, // Maximum programmable ArchVGPRs across all targets.
123123
SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
124124
NUM_EXTRA_VGPRS = 1, // A reserved slot for DS.
125-
EXTRA_VGPR_LDS = 0, // This is a placeholder the Shader algorithm uses.
125+
EXTRA_VGPR_LDS = 0, // An artificial register to track LDS writes.
126126
NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
127127
};
128128

@@ -496,6 +496,14 @@ void WaitcntBrackets::setExpScore(const MachineInstr *MI,
496496
}
497497
}
498498

499+
// MUBUF and FLAT LDS DMA operations need a wait on vmcnt before the LDS data
500+
// they write can be accessed. A load from LDS to VMEM does not need a wait.
501+
static bool mayWriteLDSThroughDMA(const MachineInstr &MI) {
502+
return SIInstrInfo::isVALU(MI) &&
503+
(SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI)) &&
504+
MI.getOpcode() != AMDGPU::BUFFER_STORE_LDS_DWORD; // Not treated as an LDS write.
505+
}
506+
499507
void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
500508
const SIRegisterInfo *TRI,
501509
const MachineRegisterInfo *MRI,
@@ -644,7 +652,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
644652
setRegScore(RegNo, T, CurrScore);
645653
}
646654
}
647-
if (TII->isDS(Inst) && Inst.mayStore()) {
655+
if (Inst.mayStore() && (TII->isDS(Inst) || mayWriteLDSThroughDMA(Inst))) {
648656
setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
649657
}
650658
}
@@ -1089,7 +1097,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
10891097
SLoadAddresses.erase(Ptr);
10901098
}
10911099
unsigned AS = Memop->getAddrSpace();
1092-
if (AS != AMDGPUAS::LOCAL_ADDRESS)
1100+
if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::FLAT_ADDRESS)
10931101
continue;
10941102
unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
10951103
// VM_CNT is only relevant to vgpr or LDS.
Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
# RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GCN %s
2+
3+
# GCN-LABEL: name: buffer_load_dword_lds_ds_read
4+
# GCN: BUFFER_LOAD_DWORD_LDS_IDXEN
5+
# GCN-NEXT: S_WAITCNT 3952
6+
# vmcnt(0)
7+
# GCN-NEXT: DS_READ_B32_gfx9
8+
---
9+
name: buffer_load_dword_lds_ds_read
10+
body: |
11+
bb.0:
12+
$m0 = S_MOV_B32 0
13+
BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `i32 addrspace(1)* undef` + 4), (store (s32) into `i32 addrspace(3)* undef` + 4)
14+
$vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `i32 addrspace(3)* undef`)
15+
S_ENDPGM 0
16+
17+
...
18+
19+
# GCN-LABEL: name: buffer_load_dword_lds_vmcnt_1
20+
# GCN: BUFFER_LOAD_DWORD_LDS_IDXEN
21+
# GCN-NEXT: BUFFER_LOAD_DWORD_IDXEN
22+
# GCN-NEXT: S_WAITCNT 3953
23+
# vmcnt(1)
24+
# GCN-NEXT: DS_READ_B32_gfx9
25+
---
26+
name: buffer_load_dword_lds_vmcnt_1
27+
body: |
28+
bb.0:
29+
$m0 = S_MOV_B32 0
30+
BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `i32 addrspace(1)* undef`), (store (s32) into `i32 addrspace(3)* undef`)
31+
$vgpr10 = BUFFER_LOAD_DWORD_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `i32 addrspace(1)* undef`)
32+
$vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `i32 addrspace(3)* undef`)
33+
S_ENDPGM 0
34+
35+
...
36+
37+
# GCN-LABEL: name: buffer_load_dword_lds_flat_read
38+
# GCN: BUFFER_LOAD_DWORD_LDS_IDXEN
39+
# GCN-NEXT: S_WAITCNT 3952
40+
# vmcnt(0)
41+
# GCN-NEXT: FLAT_LOAD_DWORD
42+
---
43+
name: buffer_load_dword_lds_flat_read
44+
body: |
45+
bb.0:
46+
$m0 = S_MOV_B32 0
47+
BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `i32 addrspace(1)* undef`), (store (s32) into `i32 addrspace(3)* undef`)
48+
$vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`)
49+
50+
S_ENDPGM 0
51+
52+
...
53+
54+
# GCN-LABEL: name: global_load_lds_dword_ds_read
55+
# GCN: GLOBAL_LOAD_LDS_DWORD
56+
# GCN-NEXT: S_WAITCNT 3952
57+
# vmcnt(0)
58+
# GCN-NEXT: DS_READ_B32_gfx9
59+
---
60+
name: global_load_lds_dword_ds_read
61+
body: |
62+
bb.0:
63+
$m0 = S_MOV_B32 0
64+
GLOBAL_LOAD_LDS_DWORD $vgpr0_vgpr1, 4, 0, implicit $exec, implicit $m0 :: (load (s32) from `i32 addrspace(1)* undef` + 4), (store (s32) into `i32 addrspace(3)* undef` + 4)
65+
$vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `i32 addrspace(3)* undef`)
66+
S_ENDPGM 0
67+
68+
...
69+
70+
# GCN-LABEL: name: scratch_load_lds_dword_ds_read
71+
# GCN: SCRATCH_LOAD_LDS_DWORD
72+
# GCN-NEXT: S_WAITCNT 3952
73+
# vmcnt(0)
74+
# GCN-NEXT: DS_READ_B32_gfx9
75+
---
76+
name: scratch_load_lds_dword_ds_read
77+
body: |
78+
bb.0:
79+
$m0 = S_MOV_B32 0
80+
SCRATCH_LOAD_LDS_DWORD $vgpr0, 4, 0, implicit $exec, implicit $m0 :: (load (s32) from `i32 addrspace(5)* undef` + 4), (store (s32) into `i32 addrspace(3)* undef` + 4)
81+
$vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `i32 addrspace(3)* undef`)
82+
S_ENDPGM 0
83+
84+
...
85+
86+
# GCN-LABEL: name: buffer_store_lds_dword_ds_read
87+
# GCN: BUFFER_STORE_LDS_DWORD
88+
# GCN-NEXT: DS_READ_B32_gfx9
89+
---
90+
name: buffer_store_lds_dword_ds_read
91+
body: |
92+
bb.0:
93+
$m0 = S_MOV_B32 0
94+
BUFFER_STORE_LDS_DWORD $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `i32 addrspace(3)* undef` + 4), (store (s32) into `i32 addrspace(1)* undef` + 4)
95+
$vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `i32 addrspace(3)* undef`)
96+
S_ENDPGM 0
97+
98+
...

0 commit comments

Comments
 (0)