Skip to content

Commit 1f84495

Browse files
rovka and jasilvanus authored
[AMDGPU] Update target helpers & GCNSchedStrategy for dynamic VGPRs (#130047)
In dynamic VGPR mode, we can allocate up to 8 blocks of either 16 or 32 VGPRs (based on a chip-wide setting which we can model with a Subtarget feature). Update some of the subtarget helpers to reflect this. In particular:
- getVGPRAllocGranule returns the block size.
- getAddressableNumVGPRs limits itself to 8 * the size of a block.
We also try to be more careful about how many VGPR blocks we allocate. Therefore, when deciding whether we should revert scheduling after a given stage, we check that we haven't increased the number of VGPR blocks that need to be allocated.
Co-authored-by: Jannik Silvanus <jannik.silvanus@amd.com>
1 parent 0f2fb2b commit 1f84495

File tree

6 files changed

+86
-0
lines changed

6 files changed

+86
-0
lines changed

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1263,6 +1263,12 @@ def FeatureDynamicVGPR : SubtargetFeature <"dynamic-vgpr",
12631263
"Enable dynamic VGPR mode"
12641264
>;
12651265

1266+
// Subtarget feature "dynamic-vgpr-block-size-32": sets the
// DynamicVGPRBlockSize32 field to true, selecting 32-VGPR blocks for dynamic
// VGPR allocation instead of the default block size of 16.
def FeatureDynamicVGPRBlockSize32 : SubtargetFeature<"dynamic-vgpr-block-size-32",
1267+
"DynamicVGPRBlockSize32",
1268+
"true",
1269+
"Use a block size of 32 for dynamic VGPR allocation (default is 16)"
1270+
>;
1271+
12661272
// Dummy feature used to disable assembler instructions.
12671273
def FeatureDisable : SubtargetFeature<"",
12681274
"FeatureDisable","true",

llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1452,6 +1452,16 @@ bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
14521452
if (WavesAfter < DAG.MinOccupancy)
14531453
return true;
14541454

1455+
// For dynamic VGPR mode, we don't want to waste any VGPR blocks.
1456+
if (ST.isDynamicVGPREnabled()) {
1457+
unsigned BlocksBefore = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
1458+
&ST, PressureBefore.getVGPRNum(false));
1459+
unsigned BlocksAfter = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
1460+
&ST, PressureAfter.getVGPRNum(false));
1461+
if (BlocksAfter > BlocksBefore)
1462+
return true;
1463+
}
1464+
14551465
return false;
14561466
}
14571467

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
192192
unsigned MaxHardClauseLength = 0;
193193
bool SupportsSRAMECC = false;
194194
bool DynamicVGPR = false;
195+
bool DynamicVGPRBlockSize32 = false;
195196

196197
// This should not be used directly. 'TargetID' tracks the dynamic settings
197198
// for SRAMECC.

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1166,6 +1166,9 @@ unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI,
11661166
if (STI->getFeatureBits().test(FeatureGFX90AInsts))
11671167
return 8;
11681168

1169+
if (STI->getFeatureBits().test(FeatureDynamicVGPR))
1170+
return STI->getFeatureBits().test(FeatureDynamicVGPRBlockSize32) ? 32 : 16;
1171+
11691172
bool IsWave32 = EnableWavefrontSize32 ?
11701173
*EnableWavefrontSize32 :
11711174
STI->getFeatureBits().test(FeatureWavefrontSize32);
@@ -1207,6 +1210,9 @@ unsigned getAddressableNumArchVGPRs(const MCSubtargetInfo *STI) { return 256; }
12071210
// Returns the number of VGPRs that can be addressed on the given subtarget.
// The checks are ordered, so the first matching feature wins:
//   - FeatureGFX90AInsts: 512.
//   - FeatureDynamicVGPR: 8 * getVGPRAllocGranule(STI), i.e. 8 blocks of the
//     configured dynamic VGPR block size.
//   - otherwise: getAddressableNumArchVGPRs(STI).
unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI) {
12081211
if (STI->getFeatureBits().test(FeatureGFX90AInsts))
12091212
return 512;
1213+
if (STI->getFeatureBits().test(FeatureDynamicVGPR))
1214+
// On GFX12 we can allocate at most 8 blocks of VGPRs.
1215+
return 8 * getVGPRAllocGranule(STI);
12101216
return getAddressableNumArchVGPRs(STI);
12111217
}
12121218

llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,24 @@ static void testGPRLimits(const char *RegName, bool TestW32W64,
153153
EXPECT_TRUE(ErrStr.empty()) << ErrStr;
154154
}
155155

156+
// Runs `test` through testAndRecord on a subtarget that has dynamic VGPR mode
// enabled.
//
// CPUName - CPU to create the target machine for.
// FS      - additional feature string; "+dynamic-vgpr," is prepended to it.
// test    - callback handed to testAndRecord together with the output table.
//
// The recorded table is streamed into the EXPECT_TRUE message, so it is
// reported when the test fails or when PrintCpuRegLimits is set.
static void testDynamicVGPRLimits(StringRef CPUName, StringRef FS,
157+
TestFuncTy test) {
158+
auto TM = createAMDGPUTargetMachine("amdgcn-amd-", CPUName,
159+
"+dynamic-vgpr," + FS.str());
160+
ASSERT_TRUE(TM) << "No target machine";
161+
162+
GCNSubtarget ST(TM->getTargetTriple(), std::string(TM->getTargetCPU()),
163+
std::string(TM->getTargetFeatureString()), *TM);
164+
// Make sure the feature string actually turned dynamic VGPR mode on.
ASSERT_TRUE(ST.getFeatureBits().test(AMDGPU::FeatureDynamicVGPR));
165+
166+
std::stringstream Table;
167+
bool Success = testAndRecord(Table, ST, test);
168+
// The expectation is deliberately false when PrintCpuRegLimits is set, so
// that the table below gets emitted even if the test itself succeeded.
EXPECT_TRUE(Success && !PrintCpuRegLimits)
169+
<< CPUName << " dynamic VGPR " << FS
170+
<< ":\nOcc MinVGPR MaxVGPR\n"
171+
<< Table.str() << '\n';
172+
}
173+
156174
TEST(AMDGPU, TestVGPRLimitsPerOccupancy) {
157175
auto test = [](std::stringstream &OS, unsigned Occ, const GCNSubtarget &ST) {
158176
unsigned MaxVGPRNum = ST.getAddressableNumVGPRs();
@@ -164,6 +182,50 @@ TEST(AMDGPU, TestVGPRLimitsPerOccupancy) {
164182
};
165183

166184
testGPRLimits("VGPR", true, test);
185+
186+
testDynamicVGPRLimits("gfx1200", "+wavefrontsize32", test);
187+
testDynamicVGPRLimits("gfx1200",
188+
"+wavefrontsize32,+dynamic-vgpr-block-size-32", test);
189+
}
190+
191+
// Checks the occupancy range and VGPR limits that a subtarget reports for a
// test function.
//
// CPUName          - CPU to create the target machine for.
// FS               - feature string passed to the target machine.
// ExpectedMinOcc   - expected lower bound of ST.getWavesPerEU(Func).
// ExpectedMaxOcc   - expected upper bound of ST.getWavesPerEU(Func).
// ExpectedMaxVGPRs - expected value of both ST.getMaxNumVGPRs(Func) and
//                    ST.getAddressableNumVGPRs().
//
// After the default checks, the function is given an explicit
// "amdgpu-waves-per-eu" of "10,12" and the reported range is expected to be
// exactly 10..12.
static void testAbsoluteLimits(StringRef CPUName, StringRef FS,
192+
unsigned ExpectedMinOcc, unsigned ExpectedMaxOcc,
193+
unsigned ExpectedMaxVGPRs) {
194+
auto TM = createAMDGPUTargetMachine("amdgcn-amd-", CPUName, FS);
195+
ASSERT_TRUE(TM) << "No target machine";
196+
197+
GCNSubtarget ST(TM->getTargetTriple(), std::string(TM->getTargetCPU()),
198+
std::string(TM->getTargetFeatureString()), *TM);
199+
200+
// Test function without attributes.
201+
LLVMContext Context;
202+
Module M("", Context);
203+
// A void() function named "testFunc" with external linkage, created in M.
Function *Func =
204+
Function::Create(FunctionType::get(Type::getVoidTy(Context), false),
205+
GlobalValue::ExternalLinkage, "testFunc", &M);
206+
Func->setCallingConv(CallingConv::AMDGPU_CS_Chain);
207+
Func->addFnAttr("amdgpu-flat-work-group-size", "1,32");
208+
209+
auto Range = ST.getWavesPerEU(*Func);
210+
EXPECT_EQ(ExpectedMinOcc, Range.first) << CPUName << ' ' << FS;
211+
EXPECT_EQ(ExpectedMaxOcc, Range.second) << CPUName << ' ' << FS;
212+
// The per-function maximum and the subtarget-wide addressable count are
// both expected to match ExpectedMaxVGPRs.
EXPECT_EQ(ExpectedMaxVGPRs, ST.getMaxNumVGPRs(*Func)) << CPUName << ' ' << FS;
213+
EXPECT_EQ(ExpectedMaxVGPRs, ST.getAddressableNumVGPRs())
214+
<< CPUName << ' ' << FS;
215+
216+
// Function with requested 'amdgpu-waves-per-eu' in a valid range.
217+
Func->addFnAttr("amdgpu-waves-per-eu", "10,12");
218+
Range = ST.getWavesPerEU(*Func);
219+
EXPECT_EQ(10u, Range.first) << CPUName << ' ' << FS;
220+
EXPECT_EQ(12u, Range.second) << CPUName << ' ' << FS;
221+
}
222+
223+
// Checks the occupancy range (1..16) and maximum VGPR count on gfx1200 in
// wave32 mode, with and without dynamic VGPRs.
TEST(AMDGPU, TestOccupancyAbsoluteLimits) {
224+
// Dynamic VGPR mode off: 256 VGPRs expected.
testAbsoluteLimits("gfx1200", "+wavefrontsize32", 1, 16, 256);
225+
// Dynamic VGPR mode with the default block size of 16: 8 * 16 = 128.
testAbsoluteLimits("gfx1200", "+wavefrontsize32,+dynamic-vgpr", 1, 16, 128);
226+
// Dynamic VGPR mode with a block size of 32: 8 * 32 = 256.
testAbsoluteLimits(
227+
"gfx1200", "+wavefrontsize32,+dynamic-vgpr,+dynamic-vgpr-block-size-32",
228+
1, 16, 256);
167229
}
168230

169231
static const char *printSubReg(const TargetRegisterInfo &TRI, unsigned SubReg) {

llvm/unittests/Target/AMDGPU/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ set(LLVM_LINK_COMPONENTS
1313
Core
1414
GlobalISel
1515
MC
16+
MIRParser
1617
Support
1718
TargetParser
1819
)

0 commit comments

Comments
 (0)