Skip to content

Commit 59994c2

Browse files
committed
AMDGPU: Select workitem ID intrinsics to 0 with req_work_group_size
Shockingly we weren't doing this already. We should probably have this be done earlier in the IR too, but it's still helpful to have the lowering guarantee it so that we can modify the ABI implicit inputs based on it.
1 parent a6f4942 commit 59994c2

File tree

4 files changed

+177
-12
lines changed

4 files changed

+177
-12
lines changed

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2888,6 +2888,8 @@ bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
28882888

28892889
Register AndMaskSrc = LiveIn;
28902890

2891+
// TODO: Avoid clearing the high bits if we know workitem id y/z are always
2892+
// 0.
28912893
if (Shift != 0) {
28922894
auto ShiftAmt = B.buildConstant(S32, Shift);
28932895
AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
@@ -4966,6 +4968,12 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
49664968
return true;
49674969
}
49684970

4971+
static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, int64_t C) {
4972+
B.buildConstant(MI.getOperand(0).getReg(), C);
4973+
MI.eraseFromParent();
4974+
return true;
4975+
}
4976+
49694977
bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
49704978
MachineInstr &MI) const {
49714979
MachineIRBuilder &B = Helper.MIRBuilder;
@@ -5069,12 +5077,20 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
50695077
case Intrinsic::amdgcn_implicitarg_ptr:
50705078
return legalizeImplicitArgPtr(MI, MRI, B);
50715079
case Intrinsic::amdgcn_workitem_id_x:
5080+
if (ST.getMaxWorkitemID(B.getMF().getFunction(), 0) == 0)
5081+
return replaceWithConstant(B, MI, 0);
50725082
return legalizePreloadedArgIntrin(MI, MRI, B,
50735083
AMDGPUFunctionArgInfo::WORKITEM_ID_X);
50745084
case Intrinsic::amdgcn_workitem_id_y:
5085+
if (ST.getMaxWorkitemID(B.getMF().getFunction(), 1) == 0)
5086+
return replaceWithConstant(B, MI, 0);
5087+
50755088
return legalizePreloadedArgIntrin(MI, MRI, B,
50765089
AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
50775090
case Intrinsic::amdgcn_workitem_id_z:
5091+
if (ST.getMaxWorkitemID(B.getMF().getFunction(), 2) == 0)
5092+
return replaceWithConstant(B, MI, 0);
5093+
50785094
return legalizePreloadedArgIntrin(MI, MRI, B,
50795095
AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
50805096
case Intrinsic::amdgcn_workgroup_id_x:

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6756,14 +6756,23 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
67566756
return getPreloadedValue(DAG, *MFI, VT,
67576757
AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
67586758
case Intrinsic::amdgcn_workitem_id_x:
6759+
if (Subtarget->getMaxWorkitemID(MF.getFunction(), 0) == 0)
6760+
return DAG.getConstant(0, DL, MVT::i32);
6761+
67596762
return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
67606763
SDLoc(DAG.getEntryNode()),
67616764
MFI->getArgInfo().WorkItemIDX);
67626765
case Intrinsic::amdgcn_workitem_id_y:
6766+
if (Subtarget->getMaxWorkitemID(MF.getFunction(), 1) == 0)
6767+
return DAG.getConstant(0, DL, MVT::i32);
6768+
67636769
return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
67646770
SDLoc(DAG.getEntryNode()),
67656771
MFI->getArgInfo().WorkItemIDY);
67666772
case Intrinsic::amdgcn_workitem_id_z:
6773+
if (Subtarget->getMaxWorkitemID(MF.getFunction(), 2) == 0)
6774+
return DAG.getConstant(0, DL, MVT::i32);
6775+
67676776
return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
67686777
SDLoc(DAG.getEntryNode()),
67696778
MFI->getArgInfo().WorkItemIDZ);

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll

Lines changed: 76 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
1-
; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=2 -mcpu=kaveri -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,HSA,CO-V2 %s
2-
; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=2 -mcpu=carrizo -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,HSA,CO-V2 %s
3-
; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=hawaii -mattr=+flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,MESA %s
4-
; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=tonga -mattr=+flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,MESA %s
5-
; RUN: llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mattr=+flat-for-global -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2 %s
6-
; RUN: llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2 %s
1+
; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=2 -mcpu=kaveri -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,HSA,CO-V2,UNPACKED %s
2+
; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=2 -mcpu=carrizo -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,HSA,CO-V2,UNPACKED %s
3+
; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=hawaii -mattr=+flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,MESA,UNPACKED %s
4+
; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=tonga -mattr=+flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,MESA,UNPACKED %s
5+
; RUN: llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mattr=+flat-for-global -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2,UNPACKED %s
6+
; RUN: llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2,UNPACKED %s
77
; RUN: llc -global-isel -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,PACKED-TID %s
88

99
declare i32 @llvm.amdgcn.workitem.id.x() #0
@@ -125,5 +125,75 @@ define void @test_workitem_id_z_func(i32 addrspace(1)* %out) #1 {
125125
ret void
126126
}
127127

128+
; FIXME: Should be able to avoid enabling in kernel inputs
129+
; FIXME: Packed tid should avoid the and
130+
; ALL-LABEL: {{^}}test_reqd_workgroup_size_x_only:
131+
; CO-V2: enable_vgpr_workitem_id = 2
132+
133+
; ALL-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
134+
; UNPACKED-DAG: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
135+
136+
; PACKED: v_and_b32_e32 [[MASKED:v[0-9]+]], 0x3ff, v0
137+
; PACKED: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]]
138+
139+
; ALL: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]]
140+
; ALL: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]]
141+
define amdgpu_kernel void @test_reqd_workgroup_size_x_only(i32* %out) !reqd_work_group_size !0 {
142+
%id.x = call i32 @llvm.amdgcn.workitem.id.x()
143+
%id.y = call i32 @llvm.amdgcn.workitem.id.y()
144+
%id.z = call i32 @llvm.amdgcn.workitem.id.z()
145+
store volatile i32 %id.x, i32* %out
146+
store volatile i32 %id.y, i32* %out
147+
store volatile i32 %id.z, i32* %out
148+
ret void
149+
}
150+
151+
; ALL-LABEL: {{^}}test_reqd_workgroup_size_y_only:
152+
; CO-V2: enable_vgpr_workitem_id = 2
153+
154+
; ALL: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
155+
; ALL: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]]
156+
157+
; UNPACKED: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
158+
159+
; PACKED: v_bfe_u32 [[MASKED:v[0-9]+]], v0, 10, 10
160+
; PACKED: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]]
161+
162+
; ALL: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]]
163+
define amdgpu_kernel void @test_reqd_workgroup_size_y_only(i32* %out) !reqd_work_group_size !1 {
164+
%id.x = call i32 @llvm.amdgcn.workitem.id.x()
165+
%id.y = call i32 @llvm.amdgcn.workitem.id.y()
166+
%id.z = call i32 @llvm.amdgcn.workitem.id.z()
167+
store volatile i32 %id.x, i32* %out
168+
store volatile i32 %id.y, i32* %out
169+
store volatile i32 %id.z, i32* %out
170+
ret void
171+
}
172+
173+
; ALL-LABEL: {{^}}test_reqd_workgroup_size_z_only:
174+
; CO-V2: enable_vgpr_workitem_id = 2
175+
176+
; ALL: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
177+
; ALL: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]]
178+
; ALL: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]]
179+
180+
; UNPACKED: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v2
181+
182+
; PACKED: v_bfe_u32 [[MASKED:v[0-9]+]], v0, 10, 20
183+
; PACKED: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]]
184+
define amdgpu_kernel void @test_reqd_workgroup_size_z_only(i32* %out) !reqd_work_group_size !2 {
185+
%id.x = call i32 @llvm.amdgcn.workitem.id.x()
186+
%id.y = call i32 @llvm.amdgcn.workitem.id.y()
187+
%id.z = call i32 @llvm.amdgcn.workitem.id.z()
188+
store volatile i32 %id.x, i32* %out
189+
store volatile i32 %id.y, i32* %out
190+
store volatile i32 %id.z, i32* %out
191+
ret void
192+
}
193+
128194
attributes #0 = { nounwind readnone }
129195
attributes #1 = { nounwind }
196+
197+
!0 = !{i32 64, i32 1, i32 1}
198+
!1 = !{i32 1, i32 64, i32 1}
199+
!2 = !{i32 1, i32 1, i32 64}

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll

Lines changed: 76 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
1-
; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=2 -mcpu=kaveri -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,CO-V2 %s
2-
; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=2 -mcpu=carrizo -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,CO-V2 %s
3-
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,MESA %s
4-
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,MESA %s
5-
; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2 %s
6-
; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2 %s
1+
; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=2 -mcpu=kaveri -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,CO-V2,UNPACKED %s
2+
; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=2 -mcpu=carrizo -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,CO-V2,UNPACKED %s
3+
; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,MESA,UNPACKED %s
4+
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,MESA,UNPACKED %s
5+
; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2,UNPACKED %s
6+
; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2,UNPACKED %s
77
; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,PACKED-TID %s
88

99
declare i32 @llvm.amdgcn.workitem.id.x() #0
@@ -63,5 +63,75 @@ define amdgpu_kernel void @test_workitem_id_z(i32 addrspace(1)* %out) #1 {
6363
ret void
6464
}
6565

66+
; FIXME: Should be able to avoid enabling in kernel inputs
67+
; FIXME: Packed tid should avoid the and
68+
; ALL-LABEL: {{^}}test_reqd_workgroup_size_x_only:
69+
; CO-V2: enable_vgpr_workitem_id = 2
70+
71+
; ALL-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
72+
; UNPACKED-DAG: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
73+
74+
; PACKED: v_and_b32_e32 [[MASKED:v[0-9]+]], 0x3ff, v0
75+
; PACKED: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]]
76+
77+
; ALL: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]]
78+
; ALL: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]]
79+
define amdgpu_kernel void @test_reqd_workgroup_size_x_only(i32* %out) !reqd_work_group_size !0 {
80+
%id.x = call i32 @llvm.amdgcn.workitem.id.x()
81+
%id.y = call i32 @llvm.amdgcn.workitem.id.y()
82+
%id.z = call i32 @llvm.amdgcn.workitem.id.z()
83+
store volatile i32 %id.x, i32* %out
84+
store volatile i32 %id.y, i32* %out
85+
store volatile i32 %id.z, i32* %out
86+
ret void
87+
}
88+
89+
; ALL-LABEL: {{^}}test_reqd_workgroup_size_y_only:
90+
; CO-V2: enable_vgpr_workitem_id = 2
91+
92+
; ALL: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
93+
; ALL: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]]
94+
95+
; UNPACKED: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
96+
97+
; PACKED: v_bfe_u32 [[MASKED:v[0-9]+]], v0, 10, 10
98+
; PACKED: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]]
99+
100+
; ALL: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]]
101+
define amdgpu_kernel void @test_reqd_workgroup_size_y_only(i32* %out) !reqd_work_group_size !1 {
102+
%id.x = call i32 @llvm.amdgcn.workitem.id.x()
103+
%id.y = call i32 @llvm.amdgcn.workitem.id.y()
104+
%id.z = call i32 @llvm.amdgcn.workitem.id.z()
105+
store volatile i32 %id.x, i32* %out
106+
store volatile i32 %id.y, i32* %out
107+
store volatile i32 %id.z, i32* %out
108+
ret void
109+
}
110+
111+
; ALL-LABEL: {{^}}test_reqd_workgroup_size_z_only:
112+
; CO-V2: enable_vgpr_workitem_id = 2
113+
114+
; ALL: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
115+
; ALL: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]]
116+
; ALL: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]]
117+
118+
; UNPACKED: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v2
119+
120+
; PACKED: v_bfe_u32 [[MASKED:v[0-9]+]], v0, 10, 20
121+
; PACKED: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]]
122+
define amdgpu_kernel void @test_reqd_workgroup_size_z_only(i32* %out) !reqd_work_group_size !2 {
123+
%id.x = call i32 @llvm.amdgcn.workitem.id.x()
124+
%id.y = call i32 @llvm.amdgcn.workitem.id.y()
125+
%id.z = call i32 @llvm.amdgcn.workitem.id.z()
126+
store volatile i32 %id.x, i32* %out
127+
store volatile i32 %id.y, i32* %out
128+
store volatile i32 %id.z, i32* %out
129+
ret void
130+
}
131+
66132
attributes #0 = { nounwind readnone }
67133
attributes #1 = { nounwind }
134+
135+
!0 = !{i32 64, i32 1, i32 1}
136+
!1 = !{i32 1, i32 64, i32 1}
137+
!2 = !{i32 1, i32 1, i32 64}

0 commit comments

Comments
 (0)