Skip to content

Commit a6f4942

Browse files
committed
AMDGPU: Optimize outgoing workitem ID based on reqd_work_group_size
If we know we aren't using a component from the kernel, we can save a few bit-packing instructions. We're still enabling the VGPR input to the kernel though.
1 parent c719a85 commit a6f4942

File tree

4 files changed

+396
-457
lines changed

4 files changed

+396
-457
lines changed

llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -879,13 +879,17 @@ bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder,
879879
Register InputReg;
880880
if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
881881
NeedWorkItemIDX) {
882-
InputReg = MRI.createGenericVirtualRegister(S32);
883-
LI->loadInputValue(InputReg, MIRBuilder, IncomingArgX,
884-
std::get<1>(WorkitemIDX), std::get<2>(WorkitemIDX));
882+
if (ST.getMaxWorkitemID(MF.getFunction(), 0) != 0) {
883+
InputReg = MRI.createGenericVirtualRegister(S32);
884+
LI->loadInputValue(InputReg, MIRBuilder, IncomingArgX,
885+
std::get<1>(WorkitemIDX), std::get<2>(WorkitemIDX));
886+
} else {
887+
InputReg = MIRBuilder.buildConstant(S32, 0).getReg(0);
888+
}
885889
}
886890

887891
if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
888-
NeedWorkItemIDY) {
892+
NeedWorkItemIDY && ST.getMaxWorkitemID(MF.getFunction(), 1) != 0) {
889893
Register Y = MRI.createGenericVirtualRegister(S32);
890894
LI->loadInputValue(Y, MIRBuilder, IncomingArgY, std::get<1>(WorkitemIDY),
891895
std::get<2>(WorkitemIDY));
@@ -895,7 +899,7 @@ bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder,
895899
}
896900

897901
if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
898-
NeedWorkItemIDZ) {
902+
NeedWorkItemIDZ && ST.getMaxWorkitemID(MF.getFunction(), 2) != 0) {
899903
Register Z = MRI.createGenericVirtualRegister(S32);
900904
LI->loadInputValue(Z, MIRBuilder, IncomingArgZ, std::get<1>(WorkitemIDZ),
901905
std::get<2>(WorkitemIDZ));

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2772,6 +2772,7 @@ void SITargetLowering::passSpecialInputs(
27722772

27732773
SelectionDAG &DAG = CLI.DAG;
27742774
const SDLoc &DL = CLI.DL;
2775+
const Function &F = DAG.getMachineFunction().getFunction();
27752776

27762777
const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
27772778
const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
@@ -2883,11 +2884,16 @@ void SITargetLowering::passSpecialInputs(
28832884

28842885
// If incoming ids are not packed we need to pack them.
28852886
if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
2886-
NeedWorkItemIDX)
2887-
InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
2887+
NeedWorkItemIDX) {
2888+
if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
2889+
InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
2890+
} else {
2891+
InputReg = DAG.getConstant(0, DL, MVT::i32);
2892+
}
2893+
}
28882894

28892895
if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
2890-
NeedWorkItemIDY) {
2896+
NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
28912897
SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
28922898
Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
28932899
DAG.getShiftAmountConstant(10, MVT::i32, SL));
@@ -2896,7 +2902,7 @@ void SITargetLowering::passSpecialInputs(
28962902
}
28972903

28982904
if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
2899-
NeedWorkItemIDZ) {
2905+
NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
29002906
SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
29012907
Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
29022908
DAG.getShiftAmountConstant(20, MVT::i32, SL));

0 commit comments

Comments
 (0)