Skip to content

Commit fac093d

Browse files
piotrAMD and rampitec authored
[AMDGPU] Update IEEE and DX10_CLAMP for GFX12 (#75030)
Co-authored-by: Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
1 parent 4fc6048 commit fac093d

26 files changed

+1656
-157
lines changed

llvm/docs/AMDGPUUsage.rst

Lines changed: 120 additions & 103 deletions
Large diffs are not rendered by default.

llvm/include/llvm/Support/AMDHSAKernelDescriptor.h

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -88,12 +88,18 @@ enum : uint8_t {
8888
// [GFX6-GFX9].
8989
#define COMPUTE_PGM_RSRC1_GFX6_GFX9(NAME, SHIFT, WIDTH) \
9090
AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC1_GFX6_GFX9_ ## NAME, SHIFT, WIDTH)
91+
// [GFX6-GFX11].
92+
#define COMPUTE_PGM_RSRC1_GFX6_GFX11(NAME, SHIFT, WIDTH) \
93+
AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC1_GFX6_GFX11_##NAME, SHIFT, WIDTH)
9194
// GFX9+.
9295
#define COMPUTE_PGM_RSRC1_GFX9_PLUS(NAME, SHIFT, WIDTH) \
9396
AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC1_GFX9_PLUS_ ## NAME, SHIFT, WIDTH)
9497
// GFX10+.
9598
#define COMPUTE_PGM_RSRC1_GFX10_PLUS(NAME, SHIFT, WIDTH) \
9699
AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC1_GFX10_PLUS_ ## NAME, SHIFT, WIDTH)
100+
// GFX12+.
101+
#define COMPUTE_PGM_RSRC1_GFX12_PLUS(NAME, SHIFT, WIDTH) \
102+
AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC1_GFX12_PLUS_##NAME, SHIFT, WIDTH)
97103
enum : int32_t {
98104
COMPUTE_PGM_RSRC1(GRANULATED_WORKITEM_VGPR_COUNT, 0, 6),
99105
COMPUTE_PGM_RSRC1(GRANULATED_WAVEFRONT_SGPR_COUNT, 6, 4),
@@ -103,9 +109,11 @@ enum : int32_t {
103109
COMPUTE_PGM_RSRC1(FLOAT_DENORM_MODE_32, 16, 2),
104110
COMPUTE_PGM_RSRC1(FLOAT_DENORM_MODE_16_64, 18, 2),
105111
COMPUTE_PGM_RSRC1(PRIV, 20, 1),
106-
COMPUTE_PGM_RSRC1(ENABLE_DX10_CLAMP, 21, 1),
112+
COMPUTE_PGM_RSRC1_GFX6_GFX11(ENABLE_DX10_CLAMP, 21, 1),
113+
COMPUTE_PGM_RSRC1_GFX12_PLUS(ENABLE_WG_RR_EN, 21, 1),
107114
COMPUTE_PGM_RSRC1(DEBUG_MODE, 22, 1),
108-
COMPUTE_PGM_RSRC1(ENABLE_IEEE_MODE, 23, 1),
115+
COMPUTE_PGM_RSRC1_GFX6_GFX11(ENABLE_IEEE_MODE, 23, 1),
116+
COMPUTE_PGM_RSRC1_GFX12_PLUS(DISABLE_PERF, 23, 1),
109117
COMPUTE_PGM_RSRC1(BULKY, 24, 1),
110118
COMPUTE_PGM_RSRC1(CDBG_USER, 25, 1),
111119
COMPUTE_PGM_RSRC1_GFX6_GFX8(RESERVED0, 26, 1),

llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -426,7 +426,7 @@ amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(
426426
memset(&KernelDescriptor, 0x0, sizeof(KernelDescriptor));
427427

428428
assert(isUInt<32>(PI.ScratchSize));
429-
assert(isUInt<32>(PI.getComputePGMRSrc1()));
429+
assert(isUInt<32>(PI.getComputePGMRSrc1(STM)));
430430
assert(isUInt<32>(PI.getComputePGMRSrc2()));
431431

432432
KernelDescriptor.group_segment_fixed_size = PI.LDSSize;
@@ -435,7 +435,7 @@ amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(
435435
Align MaxKernArgAlign;
436436
KernelDescriptor.kernarg_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
437437

438-
KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1();
438+
KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1(STM);
439439
KernelDescriptor.compute_pgm_rsrc2 = PI.getComputePGMRSrc2();
440440
KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF);
441441

@@ -974,7 +974,7 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
974974
if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
975975
OutStreamer->emitInt32(R_00B848_COMPUTE_PGM_RSRC1);
976976

977-
OutStreamer->emitInt32(CurrentProgramInfo.getComputePGMRSrc1());
977+
OutStreamer->emitInt32(CurrentProgramInfo.getComputePGMRSrc1(STM));
978978

979979
OutStreamer->emitInt32(R_00B84C_COMPUTE_PGM_RSRC2);
980980
OutStreamer->emitInt32(CurrentProgramInfo.getComputePGMRSrc2());
@@ -1038,7 +1038,7 @@ void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
10381038

10391039
MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU);
10401040
if (MD->getPALMajorVersion() < 3) {
1041-
MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC));
1041+
MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC, STM));
10421042
if (AMDGPU::isCompute(CC)) {
10431043
MD->setRsrc2(CC, CurrentProgramInfo.getComputePGMRSrc2());
10441044
} else {
@@ -1116,10 +1116,11 @@ void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
11161116
const MachineFrameInfo &MFI = MF.getFrameInfo();
11171117
StringRef FnName = MF.getFunction().getName();
11181118
MD->setFunctionScratchSize(FnName, MFI.getStackSize());
1119+
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
11191120

11201121
// Set compute registers
11211122
MD->setRsrc1(CallingConv::AMDGPU_CS,
1122-
CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS));
1123+
CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS, ST));
11231124
MD->setRsrc2(CallingConv::AMDGPU_CS, CurrentProgramInfo.getComputePGMRSrc2());
11241125

11251126
// Set optional info
@@ -1155,7 +1156,7 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
11551156
AMDGPU::initDefaultAMDKernelCodeT(Out, &STM);
11561157

11571158
Out.compute_pgm_resource_registers =
1158-
CurrentProgramInfo.getComputePGMRSrc1() |
1159+
CurrentProgramInfo.getComputePGMRSrc1(STM) |
11591160
(CurrentProgramInfo.getComputePGMRSrc2() << 32);
11601161
Out.code_properties |= AMD_CODE_PROPERTY_IS_PTR64;
11611162

llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2199,7 +2199,7 @@ bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
21992199
auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
22002200
Impl.DT = DTWP ? &DTWP->getDomTree() : nullptr;
22012201
Impl.HasUnsafeFPMath = hasUnsafeFPMath(F);
2202-
SIModeRegisterDefaults Mode(F);
2202+
SIModeRegisterDefaults Mode(F, *Impl.ST);
22032203
Impl.HasFP32DenormalFlush =
22042204
Mode.FP32Denormals == DenormalMode::getPreserveSign();
22052205
return Impl.run(F);
@@ -2216,7 +2216,7 @@ PreservedAnalyses AMDGPUCodeGenPreparePass::run(Function &F,
22162216
Impl.UA = &FAM.getResult<UniformityInfoAnalysis>(F);
22172217
Impl.DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
22182218
Impl.HasUnsafeFPMath = hasUnsafeFPMath(F);
2219-
SIModeRegisterDefaults Mode(F);
2219+
SIModeRegisterDefaults Mode(F, *Impl.ST);
22202220
Impl.HasFP32DenormalFlush =
22212221
Mode.FP32Denormals == DenormalMode::getPreserveSign();
22222222
PreservedAnalyses PA = PreservedAnalyses::none();

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,7 @@ bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
132132
}
133133
#endif
134134
Subtarget = &MF.getSubtarget<GCNSubtarget>();
135-
Mode = SIModeRegisterDefaults(MF.getFunction());
135+
Mode = SIModeRegisterDefaults(MF.getFunction(), *Subtarget);
136136
return SelectionDAGISel::runOnMachineFunction(MF);
137137
}
138138

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1499,13 +1499,13 @@ bool GCNTargetMachine::parseMachineFunctionInfo(
14991499
static_cast<const yaml::SIMachineFunctionInfo &>(MFI_);
15001500
MachineFunction &MF = PFS.MF;
15011501
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1502+
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
15021503

15031504
if (MFI->initializeBaseYamlFields(YamlMFI, MF, PFS, Error, SourceRange))
15041505
return true;
15051506

15061507
if (MFI->Occupancy == 0) {
15071508
// Fixup the subtarget dependent default value.
1508-
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
15091509
MFI->Occupancy = ST.computeOccupancy(MF.getFunction(), MFI->getLDSSize());
15101510
}
15111511

@@ -1659,8 +1659,10 @@ bool GCNTargetMachine::parseMachineFunctionInfo(
16591659
MFI->ArgInfo.WorkItemIDZ, 0, 0)))
16601660
return true;
16611661

1662-
MFI->Mode.IEEE = YamlMFI.Mode.IEEE;
1663-
MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp;
1662+
if (ST.hasIEEEMode())
1663+
MFI->Mode.IEEE = YamlMFI.Mode.IEEE;
1664+
if (ST.hasDX10ClampMode())
1665+
MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp;
16641666

16651667
// FIXME: Move proper support for denormal-fp-math into base MachineFunction
16661668
MFI->Mode.FP32Denormals.Input = YamlMFI.Mode.FP32InputDenormals

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -296,7 +296,7 @@ GCNTTIImpl::GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
296296
ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
297297
TLI(ST->getTargetLowering()), CommonTTI(TM, F),
298298
IsGraphics(AMDGPU::isGraphics(F.getCallingConv())) {
299-
SIModeRegisterDefaults Mode(F);
299+
SIModeRegisterDefaults Mode(F, *ST);
300300
HasFP32Denormals = Mode.FP32Denormals != DenormalMode::getPreserveSign();
301301
HasFP64FP16Denormals =
302302
Mode.FP64FP16Denormals != DenormalMode::getPreserveSign();
@@ -1163,8 +1163,8 @@ bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
11631163

11641164
// FIXME: dx10_clamp can just take the caller setting, but there seems to be
11651165
// no way to support merge for backend defined attributes.
1166-
SIModeRegisterDefaults CallerMode(*Caller);
1167-
SIModeRegisterDefaults CalleeMode(*Callee);
1166+
SIModeRegisterDefaults CallerMode(*Caller, *CallerST);
1167+
SIModeRegisterDefaults CalleeMode(*Callee, *CalleeST);
11681168
if (!CallerMode.isInlineCompatible(CalleeMode))
11691169
return false;
11701170

llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5342,11 +5342,17 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
53425342
COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64, Val,
53435343
ValRange);
53445344
} else if (ID == ".amdhsa_dx10_clamp") {
5345+
if (IVersion.Major >= 12)
5346+
return Error(IDRange.Start, "directive unsupported on gfx12+", IDRange);
53455347
PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1,
5346-
COMPUTE_PGM_RSRC1_ENABLE_DX10_CLAMP, Val, ValRange);
5348+
COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_DX10_CLAMP, Val,
5349+
ValRange);
53475350
} else if (ID == ".amdhsa_ieee_mode") {
5348-
PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_ENABLE_IEEE_MODE,
5349-
Val, ValRange);
5351+
if (IVersion.Major >= 12)
5352+
return Error(IDRange.Start, "directive unsupported on gfx12+", IDRange);
5353+
PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1,
5354+
COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_IEEE_MODE, Val,
5355+
ValRange);
53505356
} else if (ID == ".amdhsa_fp16_overflow") {
53515357
if (IVersion.Major < 9)
53525358
return Error(IDRange.Start, "directive requires gfx9+", IDRange);
@@ -5409,6 +5415,12 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
54095415
PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2,
54105416
COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_INT_DIVIDE_BY_ZERO,
54115417
Val, ValRange);
5418+
} else if (ID == ".amdhsa_round_robin_scheduling") {
5419+
if (IVersion.Major < 12)
5420+
return Error(IDRange.Start, "directive requires gfx12+", IDRange);
5421+
PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1,
5422+
COMPUTE_PGM_RSRC1_GFX12_PLUS_ENABLE_WG_RR_EN, Val,
5423+
ValRange);
54125424
} else {
54135425
return Error(IDRange.Start, "unknown .amdhsa_kernel directive", IDRange);
54145426
}
@@ -5562,6 +5574,18 @@ bool AMDGPUAsmParser::ParseAMDKernelCodeTValue(StringRef ID,
55625574
}
55635575
Lex();
55645576

5577+
if (ID == "enable_dx10_clamp") {
5578+
if (G_00B848_DX10_CLAMP(Header.compute_pgm_resource_registers) &&
5579+
isGFX12Plus())
5580+
return TokError("enable_dx10_clamp=1 is not allowed on GFX12+");
5581+
}
5582+
5583+
if (ID == "enable_ieee_mode") {
5584+
if (G_00B848_IEEE_MODE(Header.compute_pgm_resource_registers) &&
5585+
isGFX12Plus())
5586+
return TokError("enable_ieee_mode=1 is not allowed on GFX12+");
5587+
}
5588+
55655589
if (ID == "enable_wavefront_size32") {
55665590
if (Header.code_properties & AMD_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32) {
55675591
if (!isGFX10Plus())

llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1868,12 +1868,16 @@ MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC1(
18681868
if (FourByteBuffer & COMPUTE_PGM_RSRC1_PRIV)
18691869
return MCDisassembler::Fail;
18701870

1871-
PRINT_DIRECTIVE(".amdhsa_dx10_clamp", COMPUTE_PGM_RSRC1_ENABLE_DX10_CLAMP);
1871+
if (!isGFX12Plus())
1872+
PRINT_DIRECTIVE(".amdhsa_dx10_clamp",
1873+
COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_DX10_CLAMP);
18721874

18731875
if (FourByteBuffer & COMPUTE_PGM_RSRC1_DEBUG_MODE)
18741876
return MCDisassembler::Fail;
18751877

1876-
PRINT_DIRECTIVE(".amdhsa_ieee_mode", COMPUTE_PGM_RSRC1_ENABLE_IEEE_MODE);
1878+
if (!isGFX12Plus())
1879+
PRINT_DIRECTIVE(".amdhsa_ieee_mode",
1880+
COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_IEEE_MODE);
18771881

18781882
if (FourByteBuffer & COMPUTE_PGM_RSRC1_BULKY)
18791883
return MCDisassembler::Fail;
@@ -1899,6 +1903,11 @@ MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC1(
18991903
PRINT_DIRECTIVE(".amdhsa_memory_ordered", COMPUTE_PGM_RSRC1_GFX10_PLUS_MEM_ORDERED);
19001904
PRINT_DIRECTIVE(".amdhsa_forward_progress", COMPUTE_PGM_RSRC1_GFX10_PLUS_FWD_PROGRESS);
19011905
}
1906+
1907+
if (isGFX12Plus())
1908+
PRINT_DIRECTIVE(".amdhsa_round_robin_scheduling",
1909+
COMPUTE_PGM_RSRC1_GFX12_PLUS_ENABLE_WG_RR_EN);
1910+
19021911
return MCDisassembler::Success;
19031912
}
19041913

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1212,6 +1212,15 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
12121212
// \returns true if CSUB atomics support a no-return form.
12131213
bool hasAtomicCSubNoRtnInsts() const { return HasAtomicCSubNoRtnInsts; }
12141214

1215+
// \returns true if the target has DX10_CLAMP kernel descriptor mode bit
1216+
bool hasDX10ClampMode() const { return getGeneration() < GFX12; }
1217+
1218+
// \returns true if the target has IEEE kernel descriptor mode bit
1219+
bool hasIEEEMode() const { return getGeneration() < GFX12; }
1220+
1221+
// \returns true if the target has WG_RR_EN kernel descriptor mode bit
1222+
bool hasRrWGMode() const { return getGeneration() >= GFX12; }
1223+
12151224
/// \returns SGPR allocation granularity supported by the subtarget.
12161225
unsigned getSGPRAllocGranule() const {
12171226
return AMDGPU::IsaInfo::getSGPRAllocGranule(this);

0 commit comments

Comments
 (0)