Skip to content

Commit c02b0c9

Browse files
committed
[OpenMP][Offload][AMDGPU] Add envar for setting CU multiplier
This PR adds a new envar for configuring the CU multiplier for cross-team-reduction kernels (SGN: 8). The following envars can now be used to set the CU multiplier for each kernel type: LIBOMPTARGET_AMDGPU_TEAMS_PER_CU (SPMD); LIBOMPTARGET_AMDGPU_GENERIC_SPMD_TEAMS_PER_CU (Generic SPMD); LIBOMPTARGET_AMDGPU_BIG_JUMP_LOOP_TEAMS_PER_CU (Big-jump-loop); LIBOMPTARGET_AMDGPU_XTEAM_RED_TEAMS_PER_CU (Cross-team-reduction).
1 parent 1f8bb4a commit c02b0c9

File tree

2 files changed

+24
-2
lines changed

2 files changed

+24
-2
lines changed

offload/plugins-nextgen/amdgpu/src/rtl.cpp

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1095,11 +1095,19 @@ struct AMDGPUKernelTy : public GenericKernelTy {
10951095
// Honor OMP_NUM_TEAMS environment variable for XteamReduction kernel
10961096
// type, if possible.
10971097
int32_t NumTeamsEnvVar = GenericDevice.getOMPNumTeams();
1098+
// CU multiplier from envar.
1099+
uint32_t EnvarCUMultiplier = GenericDevice.getXTeamRedTeamsPerCU();
1100+
// Disabled if the value is 0.
1101+
if (EnvarCUMultiplier == 0) {
1102+
EnvarCUMultiplier = UINT_MAX;
1103+
}
10981104

10991105
if (GenericDevice.isFastReductionEnabled()) {
11001106
// When fast reduction is enabled, the number of teams is capped by
11011107
// the MaxCUMultiplier constant.
1102-
MaxNumGroups = DeviceNumCUs * llvm::omp::xteam_red::MaxCUMultiplier;
1108+
MaxNumGroups =
1109+
DeviceNumCUs * std::min(llvm::omp::xteam_red::MaxCUMultiplier,
1110+
static_cast<int16_t>(EnvarCUMultiplier));
11031111
} else {
11041112
// When fast reduction is not enabled, the number of teams is capped
11051113
// by the metadata that clang CodeGen created. The number of teams
@@ -1110,7 +1118,7 @@ struct AMDGPUKernelTy : public GenericKernelTy {
11101118
// ConstWGSize is the block size that CodeGen used.
11111119
uint32_t CUMultiplier =
11121120
llvm::omp::xteam_red::getXteamRedCUMultiplier(ConstWGSize);
1113-
MaxNumGroups = DeviceNumCUs * CUMultiplier;
1121+
MaxNumGroups = DeviceNumCUs * std::min(CUMultiplier, EnvarCUMultiplier);
11141122
}
11151123

11161124
// If envar OMPX_XTEAMREDUCTION_OCCUPANCY_BASED_OPT is set and no
@@ -2915,6 +2923,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
29152923
"LIBOMPTARGET_AMDGPU_GENERIC_SPMD_TEAMS_PER_CU", 6),
29162924
OMPX_BigJumpLoopTeamsPerCU(
29172925
"LIBOMPTARGET_AMDGPU_BIG_JUMP_LOOP_TEAMS_PER_CU", 0),
2926+
OMPX_XTeamRedTeamsPerCU("LIBOMPTARGET_AMDGPU_XTEAM_RED_TEAMS_PER_CU",
2927+
0),
29182928
OMPX_BigJumpLoopMaxTotalTeams(
29192929
"LIBOMPTARGET_AMDGPU_BIG_JUMP_LOOP_MAX_TOTAL_TEAMS", 1024 * 1024),
29202930
OMPX_LowTripCount("LIBOMPTARGET_AMDGPU_LOW_TRIPCOUNT", 9000),
@@ -2980,6 +2990,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
29802990
virtual uint32_t getOMPXBigJumpLoopTeamsPerCU() const override {
29812991
return OMPX_BigJumpLoopTeamsPerCU;
29822992
}
2993+
virtual uint32_t getXTeamRedTeamsPerCU() const override {
2994+
return OMPX_XTeamRedTeamsPerCU;
2995+
}
29832996
virtual uint32_t getOMPXBigJumpLoopMaxTotalTeams() const override {
29842997
return OMPX_BigJumpLoopMaxTotalTeams;
29852998
}
@@ -4427,6 +4440,12 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
44274440
/// OMPX_BigJumpLoopTeamsPerCU * #CUs.
44284441
UInt32Envar OMPX_BigJumpLoopTeamsPerCU;
44294442

4443+
/// Envar for controlling the number of teams relative to the number of
4444+
/// compute units (CUs) for cross-team-reduction kernels. 0 indicates that
4445+
/// this value is not specified. If non-zero, it caps the CU multiplier: the
4446+
/// maximum number of teams = min(default multiplier, OMPX_XTeamRedTeamsPerCU) * #CUs.
4447+
UInt32Envar OMPX_XTeamRedTeamsPerCU;
4448+
44304449
/// Envar controlling the maximum number of teams per device for
44314450
/// Big-Jump-Loop kernels.
44324451
UInt32Envar OMPX_BigJumpLoopMaxTotalTeams;

offload/plugins-nextgen/common/include/PluginInterface.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1017,6 +1017,9 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
10171017
virtual uint32_t getOMPXBigJumpLoopTeamsPerCU() const {
10181018
llvm_unreachable("Unimplemented");
10191019
}
1020+
virtual uint32_t getXTeamRedTeamsPerCU() const {
1021+
llvm_unreachable("Unimplemented");
1022+
}
10201023
virtual uint32_t getOMPXBigJumpLoopMaxTotalTeams() const {
10211024
llvm_unreachable("Unimplemented");
10221025
}

0 commit comments

Comments
 (0)