Skip to content

Commit d9bf8ab

Browse files
committed
[AMDGPU] Add MMOs for GFX11 Streamout Instructions
The GFX11 NGG Streamout Instructions perform atomic operations on dedicated registers. At the moment, they lack machine memory operands, which causes the si-memory-legalizer pass to treat them conservatively and introduce several unnecessary waits and cache invalidations.

This patch introduces a new address space to represent these special registers and teaches instruction selection to add memory operands with this new address space to DS_ADD_GS_REG_RTN and DS_SUB_GS_REG_RTN.

Since this address space is meant to be compiler-internal, we move it up a bit from the other address spaces and give it the number 128. According to the LLVM Language Reference, address space numbers can go all the way up to 2^24, but I'm not sure how well this is supported in practice [1], so using a smaller number seems safer.

[1] https://github.com/llvm/llvm-project/blob/0107513fe79da7670e37c29c0862794a2213a89c/llvm/utils/TableGen/IntrinsicEmitter.cpp#L401

Differential Revision: https://reviews.llvm.org/D146031
1 parent f8861ea commit d9bf8ab

File tree

6 files changed: +25 -46 lines changed

llvm/docs/AMDGPUUsage.rst

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -675,6 +675,7 @@ supported for the ``amdgcn`` target.
675675
Private 5 private scratch 32 0xFFFFFFFF
676676
Constant 32-bit 6 *TODO* 0x00000000
677677
Buffer Fat Pointer (experimental) 7 *TODO*
678+
Streamout Registers 128 N/A GS_REGS
678679
================================= =============== =========== ================ ======= ============================
679680

680681
**Generic**
@@ -783,6 +784,13 @@ supported for the ``amdgcn`` target.
783784
model the buffer descriptors used heavily in graphics workloads targeting
784785
the backend.
785786

787+
**Streamout Registers**
788+
Dedicated registers used by the GS NGG Streamout Instructions. The register
789+
file is modelled as a memory in a distinct address space because it is indexed
790+
by an address-like offset in place of named registers, and because register
791+
accesses affect LGKMcnt. This is an internal address space used only by the
792+
compiler. Do not use this address space for IR pointers.
793+
786794
.. _amdgpu-memory-scopes:
787795

788796
Memory Scopes

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -2000,12 +2000,14 @@ def int_amdgcn_permlane64 :
20002000
def int_amdgcn_ds_add_gs_reg_rtn :
20012001
ClangBuiltin<"__builtin_amdgcn_ds_add_gs_reg_rtn">,
20022002
Intrinsic<[llvm_anyint_ty], [llvm_i32_ty, llvm_i32_ty],
2003-
[ImmArg<ArgIndex<1>>, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
2003+
[ImmArg<ArgIndex<1>>, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree],
2004+
"", [SDNPMemOperand]>;
20042005

20052006
def int_amdgcn_ds_sub_gs_reg_rtn :
20062007
ClangBuiltin<"__builtin_amdgcn_ds_sub_gs_reg_rtn">,
20072008
Intrinsic<[llvm_anyint_ty], [llvm_i32_ty, llvm_i32_ty],
2008-
[ImmArg<ArgIndex<1>>, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
2009+
[ImmArg<ArgIndex<1>>, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree],
2010+
"", [SDNPMemOperand]>;
20092011

20102012
def int_amdgcn_ds_bvh_stack_rtn :
20112013
Intrinsic<

llvm/lib/Target/AMDGPU/AMDGPU.h

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -376,6 +376,10 @@ enum : unsigned {
376376

377377
BUFFER_FAT_POINTER = 7, ///< Address space for 160-bit buffer fat pointers.
378378

379+
/// Internal address spaces. Can be freely renumbered.
380+
STREAMOUT_REGISTER = 128, ///< Address space for GS NGG Streamout registers.
381+
/// end Internal address spaces.
382+
379383
/// Address space for direct addressable parameter memory (CONST0).
380384
PARAM_D_ADDRESS = 6,
381385
/// Address space for indirect addressable parameter memory (VTX1).

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1096,6 +1096,15 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
10961096

10971097
return true;
10981098
}
1099+
case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1100+
case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1101+
Info.opc = ISD::INTRINSIC_W_CHAIN;
1102+
Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1103+
Info.ptrVal = nullptr;
1104+
Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1105+
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1106+
return true;
1107+
}
10991108
case Intrinsic::amdgcn_ds_append:
11001109
case Intrinsic::amdgcn_ds_consume: {
11011110
Info.opc = ISD::INTRINSIC_W_CHAIN;

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.add.gs.reg.rtn.ll

Lines changed: 0 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -8,13 +8,7 @@ declare i64 @llvm.amdgcn.ds.add.gs.reg.rtn.i64(i32, i32 immarg)
88
define amdgpu_gs void @test_add_32(i32 %arg) {
99
; CHECK-LABEL: test_add_32:
1010
; CHECK: ; %bb.0:
11-
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
12-
; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
1311
; CHECK-NEXT: ds_add_gs_reg_rtn v[0:1], v0 offset:16 gds
14-
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
15-
; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
16-
; CHECK-NEXT: buffer_gl0_inv
17-
; CHECK-NEXT: buffer_gl1_inv
1812
; CHECK-NEXT: s_endpgm
1913
%unused = call i32 @llvm.amdgcn.ds.add.gs.reg.rtn.i32(i32 %arg, i32 16)
2014
ret void
@@ -23,13 +17,8 @@ define amdgpu_gs void @test_add_32(i32 %arg) {
2317
define amdgpu_gs void @test_add_32_use(i32 %arg, ptr addrspace(1) %out) {
2418
; CHECK-LABEL: test_add_32_use:
2519
; CHECK: ; %bb.0:
26-
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
27-
; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
2820
; CHECK-NEXT: ds_add_gs_reg_rtn v[3:4], v0 offset:16 gds
2921
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
30-
; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
31-
; CHECK-NEXT: buffer_gl0_inv
32-
; CHECK-NEXT: buffer_gl1_inv
3322
; CHECK-NEXT: global_store_b32 v[1:2], v3, off
3423
; CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3524
; CHECK-NEXT: s_endpgm
@@ -41,13 +30,7 @@ define amdgpu_gs void @test_add_32_use(i32 %arg, ptr addrspace(1) %out) {
4130
define amdgpu_gs void @test_add_64(i32 %arg) {
4231
; CHECK-LABEL: test_add_64:
4332
; CHECK: ; %bb.0:
44-
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
45-
; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
4633
; CHECK-NEXT: ds_add_gs_reg_rtn v[0:1], v0 offset:32 gds
47-
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
48-
; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
49-
; CHECK-NEXT: buffer_gl0_inv
50-
; CHECK-NEXT: buffer_gl1_inv
5134
; CHECK-NEXT: s_endpgm
5235
%unused = call i64 @llvm.amdgcn.ds.add.gs.reg.rtn.i64(i32 %arg, i32 32)
5336
ret void
@@ -56,13 +39,8 @@ define amdgpu_gs void @test_add_64(i32 %arg) {
5639
define amdgpu_gs void @test_add_64_use(i32 %arg, ptr addrspace(1) %out) {
5740
; CHECK-LABEL: test_add_64_use:
5841
; CHECK: ; %bb.0:
59-
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
60-
; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
6142
; CHECK-NEXT: ds_add_gs_reg_rtn v[3:4], v0 offset:32 gds
6243
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
63-
; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
64-
; CHECK-NEXT: buffer_gl0_inv
65-
; CHECK-NEXT: buffer_gl1_inv
6644
; CHECK-NEXT: global_store_b64 v[1:2], v[3:4], off
6745
; CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
6846
; CHECK-NEXT: s_endpgm

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.sub.gs.reg.rtn.ll

Lines changed: 0 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -8,13 +8,7 @@ declare i64 @llvm.amdgcn.ds.sub.gs.reg.rtn.i64(i32, i32 immarg)
88
define amdgpu_gs void @test_sub_32(i32 %arg) {
99
; CHECK-LABEL: test_sub_32:
1010
; CHECK: ; %bb.0:
11-
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
12-
; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
1311
; CHECK-NEXT: ds_sub_gs_reg_rtn v[0:1], v0 offset:16 gds
14-
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
15-
; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
16-
; CHECK-NEXT: buffer_gl0_inv
17-
; CHECK-NEXT: buffer_gl1_inv
1812
; CHECK-NEXT: s_endpgm
1913
%unused = call i32 @llvm.amdgcn.ds.sub.gs.reg.rtn.i32(i32 %arg, i32 16)
2014
ret void
@@ -23,13 +17,8 @@ define amdgpu_gs void @test_sub_32(i32 %arg) {
2317
define amdgpu_gs void @test_sub_32_use(i32 %arg, ptr addrspace(1) %out) {
2418
; CHECK-LABEL: test_sub_32_use:
2519
; CHECK: ; %bb.0:
26-
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
27-
; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
2820
; CHECK-NEXT: ds_sub_gs_reg_rtn v[3:4], v0 offset:16 gds
2921
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
30-
; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
31-
; CHECK-NEXT: buffer_gl0_inv
32-
; CHECK-NEXT: buffer_gl1_inv
3322
; CHECK-NEXT: global_store_b32 v[1:2], v3, off
3423
; CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3524
; CHECK-NEXT: s_endpgm
@@ -41,13 +30,7 @@ define amdgpu_gs void @test_sub_32_use(i32 %arg, ptr addrspace(1) %out) {
4130
define amdgpu_gs void @test_sub_64(i32 %arg) {
4231
; CHECK-LABEL: test_sub_64:
4332
; CHECK: ; %bb.0:
44-
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
45-
; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
4633
; CHECK-NEXT: ds_sub_gs_reg_rtn v[0:1], v0 offset:32 gds
47-
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
48-
; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
49-
; CHECK-NEXT: buffer_gl0_inv
50-
; CHECK-NEXT: buffer_gl1_inv
5134
; CHECK-NEXT: s_endpgm
5235
%unused = call i64 @llvm.amdgcn.ds.sub.gs.reg.rtn.i64(i32 %arg, i32 32)
5336
ret void
@@ -56,13 +39,8 @@ define amdgpu_gs void @test_sub_64(i32 %arg) {
5639
define amdgpu_gs void @test_sub_64_use(i32 %arg, ptr addrspace(1) %out) {
5740
; CHECK-LABEL: test_sub_64_use:
5841
; CHECK: ; %bb.0:
59-
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
60-
; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
6142
; CHECK-NEXT: ds_sub_gs_reg_rtn v[3:4], v0 offset:32 gds
6243
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
63-
; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
64-
; CHECK-NEXT: buffer_gl0_inv
65-
; CHECK-NEXT: buffer_gl1_inv
6644
; CHECK-NEXT: global_store_b64 v[1:2], v[3:4], off
6745
; CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
6846
; CHECK-NEXT: s_endpgm

0 commit comments

Comments
 (0)