Skip to content

Commit 074b802

Browse files
committed
AMDGPU: Fix DAG divergence for implicit function arguments
Divergence should be directly implied by the register class, so there is no need to special-case live-ins here. The old logic got the wrong answer for the queue ptr argument in callable functions: it is not an explicit IR argument and is always uniform, yet it was treated as divergent. This fixes the failure to use scalar loads for the aperture in addrspacecast lowering, and in any other places that use implicit SGPR arguments.
1 parent 61813b8 commit 074b802

File tree

5 files changed

+58
-92
lines changed

5 files changed

+58
-92
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 10 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -11026,30 +11026,19 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode * N,
1102611026
case ISD::CopyFromReg:
1102711027
{
1102811028
const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
11029-
const MachineFunction * MF = FLI->MF;
11030-
const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
11031-
const MachineRegisterInfo &MRI = MF->getRegInfo();
11032-
const SIRegisterInfo &TRI = ST.getInstrInfo()->getRegisterInfo();
11029+
const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
11030+
const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
1103311031
Register Reg = R->getReg();
11034-
if (Reg.isPhysical())
11035-
return !TRI.isSGPRReg(MRI, Reg);
11036-
11037-
if (MRI.isLiveIn(Reg)) {
11038-
// workitem.id.x workitem.id.y workitem.id.z
11039-
// Any VGPR formal argument is also considered divergent
11040-
if (!TRI.isSGPRReg(MRI, Reg))
11041-
return true;
11042-
// Formal arguments of non-entry functions
11043-
// are conservatively considered divergent
11044-
else if (!AMDGPU::isEntryFunctionCC(FLI->Fn->getCallingConv()))
11045-
return true;
11046-
return false;
11047-
}
11048-
const Value *V = FLI->getValueFromVirtualReg(Reg);
11049-
if (V)
11032+
11033+
// FIXME: Why does this need to consider isLiveIn?
11034+
if (Reg.isPhysical() || MRI.isLiveIn(Reg))
11035+
return !TRI->isSGPRReg(MRI, Reg);
11036+
11037+
if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
1105011038
return KDA->isDivergent(V);
11039+
1105111040
assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
11052-
return !TRI.isSGPRReg(MRI, Reg);
11041+
return !TRI->isSGPRReg(MRI, Reg);
1105311042
}
1105411043
break;
1105511044
case ISD::LOAD: {

llvm/test/CodeGen/AMDGPU/addrspacecast.ll

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,31 @@ define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %pt
4040
ret void
4141
}
4242

43+
; Test handling inside a non-kernel
44+
; HSA-LABEL: {{^}}use_group_to_flat_addrspacecast_func:
45+
; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10{{$}}
46+
; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
47+
; CI-DAG: v_cmp_ne_u32_e32 vcc, -1, v0
48+
; CI-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
49+
; CI-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, v0
50+
51+
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
52+
; GFX9-DAG: s_getreg_b32 [[SSRC_SHARED:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 16, 16)
53+
; GFX9-DAG: s_lshl_b32 [[SSRC_SHARED_BASE:s[0-9]+]], [[SSRC_SHARED]], 16
54+
; GFX9-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[SSRC_SHARED_BASE]]
55+
56+
; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_shared_base
57+
; GFX9-DAG: v_cmp_ne_u32_e32 vcc, -1, v0
58+
; GFX9-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, v0, vcc
59+
; GFX9-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
60+
61+
; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]]
62+
define void @use_group_to_flat_addrspacecast_func(i32 addrspace(3)* %ptr) #0 {
63+
%stof = addrspacecast i32 addrspace(3)* %ptr to i32*
64+
store volatile i32 7, i32* %stof
65+
ret void
66+
}
67+
4368
; HSA-LABEL: {{^}}use_private_to_flat_addrspacecast:
4469
; HSA: enable_sgpr_private_segment_buffer = 1
4570
; HSA: enable_sgpr_dispatch_ptr = 0

llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll

Lines changed: 6 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,7 @@
22
; RUN: llc -amdgpu-fixed-function-abi -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
33

44
; GCN-LABEL: {{^}}use_dispatch_ptr:
5-
; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s4
6-
; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s5
7-
; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
5+
; GCN: s_load_dword s{{[0-9]+}}, s[4:5]
86
define hidden void @use_dispatch_ptr() #1 {
97
%dispatch_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0
108
%header_ptr = bitcast i8 addrspace(4)* %dispatch_ptr to i32 addrspace(4)*
@@ -13,9 +11,7 @@ define hidden void @use_dispatch_ptr() #1 {
1311
}
1412

1513
; GCN-LABEL: {{^}}use_queue_ptr:
16-
; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s6
17-
; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s7
18-
; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
14+
; GCN: s_load_dword s{{[0-9]+}}, s[6:7]
1915
define hidden void @use_queue_ptr() #1 {
2016
%queue_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0
2117
%header_ptr = bitcast i8 addrspace(4)* %queue_ptr to i32 addrspace(4)*
@@ -34,9 +30,7 @@ define hidden void @use_kernarg_segment_ptr() #1 {
3430
}
3531

3632
; GCN-LABEL: {{^}}use_implicitarg_ptr:
37-
; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s8
38-
; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s9
39-
; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
33+
; GCN: s_load_dword s{{[0-9]+}}, s[8:9]
4034
define hidden void @use_implicitarg_ptr() #1 {
4135
%implicit.arg.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #0
4236
%header_ptr = bitcast i8 addrspace(4)* %implicit.arg.ptr to i32 addrspace(4)*
@@ -198,15 +192,9 @@ define hidden void @other_arg_use_workgroup_id_z(i32 %arg0) #1 {
198192

199193
; GCN-LABEL: {{^}}use_every_sgpr_input:
200194
; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32{{$}}
201-
; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s4
202-
; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s5
203-
; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
204-
; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s6
205-
; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s7
206-
; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
207-
; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s8
208-
; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s9
209-
; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
195+
; GCN: s_load_dword s{{[0-9]+}}, s[4:5]
196+
; GCN: s_load_dword s{{[0-9]+}}, s[6:7]
197+
; GCN: s_load_dword s{{[0-9]+}}, s[8:9]
210198
; GCN: ; use s[10:11]
211199
; GCN: ; use s12
212200
; GCN: ; use s13

llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll

Lines changed: 11 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,7 @@
22
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-code-object-v3 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,GFX9 %s
33

44
; GCN-LABEL: {{^}}use_dispatch_ptr:
5-
; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s4
6-
; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s5
7-
; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
5+
; GCN: s_load_dword s{{[0-9]+}}, s[4:5]
86
define hidden void @use_dispatch_ptr() #1 {
97
%dispatch_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0
108
%header_ptr = bitcast i8 addrspace(4)* %dispatch_ptr to i32 addrspace(4)*
@@ -23,9 +21,7 @@ define amdgpu_kernel void @kern_indirect_use_dispatch_ptr(i32) #1 {
2321
}
2422

2523
; GCN-LABEL: {{^}}use_queue_ptr:
26-
; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s4
27-
; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s5
28-
; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
24+
; GCN: s_load_dword s{{[0-9]+}}, s[4:5]
2925
define hidden void @use_queue_ptr() #1 {
3026
%queue_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0
3127
%header_ptr = bitcast i8 addrspace(4)* %queue_ptr to i32 addrspace(4)*
@@ -44,10 +40,10 @@ define amdgpu_kernel void @kern_indirect_use_queue_ptr(i32) #1 {
4440
}
4541

4642
; GCN-LABEL: {{^}}use_queue_ptr_addrspacecast:
47-
; CIVI: flat_load_dword v[[HI:[0-9]+]], v[0:1]
43+
; CIVI: s_load_dword [[APERTURE_LOAD:s[0-9]+]], s[4:5], 0x10
4844
; GFX9: s_getreg_b32 [[APERTURE_LOAD:s[0-9]+]]
4945
; CIVI: v_mov_b32_e32 v[[LO:[0-9]+]], 16
50-
; GFX9: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE_LOAD]]
46+
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE_LOAD]]
5147
; GFX9: {{flat|global}}_store_dword v{{\[[0-9]+}}:[[HI]]{{\]}}
5248
; CIVI: {{flat|global}}_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}
5349
define hidden void @use_queue_ptr_addrspacecast() #1 {
@@ -401,15 +397,10 @@ define amdgpu_kernel void @kern_indirect_other_arg_use_workgroup_id_z() #1 {
401397

402398
; GCN-LABEL: {{^}}use_every_sgpr_input:
403399
; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32{{$}}
404-
; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s4
405-
; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s5
406-
; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
407-
; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s6
408-
; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s7
409-
; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
410-
; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s8
411-
; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s9
412-
; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
400+
; GCN: s_load_dword s{{[0-9]+}}, s[4:5]
401+
; GCN: s_load_dword s{{[0-9]+}}, s[6:7]
402+
; GCN: s_load_dword s{{[0-9]+}}, s[8:9]
403+
413404
; GCN: ; use s[10:11]
414405
; GCN: ; use s12
415406
; GCN: ; use s13
@@ -551,15 +542,9 @@ define hidden void @func_use_every_sgpr_input_call_use_workgroup_id_xyz() #1 {
551542
; GCN: s_swappc_b64
552543

553544
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33{{$}}
554-
; GCN-DAG: v_mov_b32_e32 v[[LO1:[0-9]+]], s[[LO_X]]
555-
; GCN-DAG: v_mov_b32_e32 v[[HI1:[0-9]+]], s[[HI_X]]
556-
; GCN-DAG: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO1]]:[[HI1]]{{\]}}
557-
; GCN-DAG: v_mov_b32_e32 v[[LO2:[0-9]+]], s[[LO_Y]]
558-
; GCN-DAG: v_mov_b32_e32 v[[HI2:[0-9]+]], s[[HI_Y]]
559-
; GCN-DAG: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO2]]:[[HI2]]{{\]}}
560-
; GCN-DAG: v_mov_b32_e32 v[[LO3:[0-9]+]], s[[LO_Z]]
561-
; GCN-DAG: v_mov_b32_e32 v[[HI3:[0-9]+]], s[[HI_Z]]
562-
; GCN-DAG: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO3]]:[[HI3]]{{\]}}
545+
; GCN-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[LO_X]]:[[HI_X]]{{\]}}, 0x0
546+
; GCN-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[LO_Y]]:[[HI_Y]]{{\]}}, 0x0
547+
; GCN-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[LO_Z]]:[[HI_Z]]{{\]}}, 0x0
563548
; GCN: ; use
564549
; GCN: ; use [[SAVE_X]]
565550
; GCN: ; use [[SAVE_Y]]

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll

Lines changed: 6 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -59,12 +59,7 @@ define amdgpu_kernel void @opencl_kernel_implicitarg_ptr([112 x i8]) #1 {
5959

6060
; GCN-LABEL: {{^}}func_implicitarg_ptr:
6161
; GCN: s_waitcnt
62-
; MESA: v_mov_b32_e32 v0, s4
63-
; MESA: v_mov_b32_e32 v1, s5
64-
; MESA: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
65-
; HSA: v_mov_b32_e32 v0, s4
66-
; HSA: v_mov_b32_e32 v1, s5
67-
; HSA: flat_load_dword v0, v[0:1]
62+
; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
6863
; GCN-NEXT: s_waitcnt
6964
; GCN-NEXT: s_setpc_b64
7065
define void @func_implicitarg_ptr() #0 {
@@ -76,12 +71,7 @@ define void @func_implicitarg_ptr() #0 {
7671

7772
; GCN-LABEL: {{^}}opencl_func_implicitarg_ptr:
7873
; GCN: s_waitcnt
79-
; MESA: v_mov_b32_e32 v0, s4
80-
; MESA: v_mov_b32_e32 v1, s5
81-
; MESA: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
82-
; HSA: v_mov_b32_e32 v0, s4
83-
; HSA: v_mov_b32_e32 v1, s5
84-
; HSA: flat_load_dword v0, v[0:1]
74+
; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
8575
; GCN-NEXT: s_waitcnt
8676
; GCN-NEXT: s_setpc_b64
8777
define void @opencl_func_implicitarg_ptr() #0 {
@@ -165,16 +155,10 @@ define void @opencl_func_call_implicitarg_ptr_func() #0 {
165155

166156
; GCN-LABEL: {{^}}func_kernarg_implicitarg_ptr:
167157
; GCN: s_waitcnt
168-
; GCN-DAG: v_mov_b32_e32 v0, s4
169-
; GCN-DAG: v_mov_b32_e32 v1, s5
170158
; GCN-DAG: s_mov_b64 [[NULL:s\[[0-9]+:[0-9]+\]]], 0
171159
; GCN-DAG: s_load_dword s{{[0-9]+}}, [[NULL]], 0x0
172-
173-
; MESA: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
174-
175-
; HSA: flat_load_dword v0, v[0:1]
176-
177-
; GCN: s_waitcnt vmcnt(0)
160+
; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
161+
; GCN: s_waitcnt lgkmcnt(0)
178162
define void @func_kernarg_implicitarg_ptr() #0 {
179163
%kernarg.segment.ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
180164
%implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
@@ -187,15 +171,10 @@ define void @func_kernarg_implicitarg_ptr() #0 {
187171

188172
; GCN-LABEL: {{^}}opencl_func_kernarg_implicitarg_ptr:
189173
; GCN: s_waitcnt
190-
; GCN-DAG: v_mov_b32_e32 v0, s4
191-
; GCN-DAG: v_mov_b32_e32 v1, s5
192174
; GCN-DAG: s_mov_b64 [[NULL:s\[[0-9]+:[0-9]+\]]], 0
193175
; GCN-DAG: s_load_dword s{{[0-9]+}}, [[NULL]], 0x0
194-
195-
; MESA: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
196-
; HSA: flat_load_dword v0, v[0:1]
197-
198-
; GCN: s_waitcnt vmcnt(0)
176+
; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
177+
; GCN: s_waitcnt lgkmcnt(0)
199178
define void @opencl_func_kernarg_implicitarg_ptr() #0 {
200179
%kernarg.segment.ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
201180
%implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()

0 commit comments

Comments
 (0)