@@ -1095,11 +1095,19 @@ struct AMDGPUKernelTy : public GenericKernelTy {
1095
1095
// Honor OMP_NUM_TEAMS environment variable for XteamReduction kernel
1096
1096
// type, if possible.
1097
1097
int32_t NumTeamsEnvVar = GenericDevice.getOMPNumTeams ();
1098
+ // CU multiplier from envar.
1099
+ uint32_t EnvarCUMultiplier = GenericDevice.getXTeamRedTeamsPerCU ();
1100
+ // Disabled if the value is 0.
1101
+ if (EnvarCUMultiplier == 0 ) {
1102
+ EnvarCUMultiplier = UINT_MAX;
1103
+ }
1098
1104
1099
1105
if (GenericDevice.isFastReductionEnabled ()) {
1100
1106
// When fast reduction is enabled, the number of teams is capped by
1101
1107
// the MaxCUMultiplier constant.
1102
- MaxNumGroups = DeviceNumCUs * llvm::omp::xteam_red::MaxCUMultiplier;
1108
+ MaxNumGroups = // Cap teams at #CUs * min(MaxCUMultiplier, envar multiplier).
1109
+ DeviceNumCUs * std::min(static_cast<uint32_t>(llvm::omp::xteam_red::MaxCUMultiplier),
1110
+ EnvarCUMultiplier); // Compare as uint32_t: narrowing UINT_MAX (envar unset) to int16_t would yield -1.
1103
1111
} else {
1104
1112
// When fast reduction is not enabled, the number of teams is capped
1105
1113
// by the metadata that clang CodeGen created. The number of teams
@@ -1110,7 +1118,7 @@ struct AMDGPUKernelTy : public GenericKernelTy {
1110
1118
// ConstWGSize is the block size that CodeGen used.
1111
1119
uint32_t CUMultiplier =
1112
1120
llvm::omp::xteam_red::getXteamRedCUMultiplier (ConstWGSize);
1113
- MaxNumGroups = DeviceNumCUs * CUMultiplier;
1121
+ MaxNumGroups = DeviceNumCUs * std::min ( CUMultiplier, EnvarCUMultiplier) ;
1114
1122
}
1115
1123
1116
1124
// If envar OMPX_XTEAMREDUCTION_OCCUPANCY_BASED_OPT is set and no
@@ -2915,6 +2923,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
2915
2923
" LIBOMPTARGET_AMDGPU_GENERIC_SPMD_TEAMS_PER_CU" , 6 ),
2916
2924
OMPX_BigJumpLoopTeamsPerCU (
2917
2925
" LIBOMPTARGET_AMDGPU_BIG_JUMP_LOOP_TEAMS_PER_CU" , 0 ),
2926
+ OMPX_XTeamRedTeamsPerCU (" LIBOMPTARGET_AMDGPU_XTEAM_RED_TEAMS_PER_CU" ,
2927
+ 0 ),
2918
2928
OMPX_BigJumpLoopMaxTotalTeams (
2919
2929
" LIBOMPTARGET_AMDGPU_BIG_JUMP_LOOP_MAX_TOTAL_TEAMS" , 1024 * 1024 ),
2920
2930
OMPX_LowTripCount (" LIBOMPTARGET_AMDGPU_LOW_TRIPCOUNT" , 9000 ),
@@ -2980,6 +2990,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
2980
2990
virtual uint32_t getOMPXBigJumpLoopTeamsPerCU () const override { // Accessor for the big-jump-loop teams-per-CU envar.
2981
2991
return OMPX_BigJumpLoopTeamsPerCU; // Initialized with a default of 0 in the constructor's initializer list.
2982
2992
}
2993
+ virtual uint32_t getXTeamRedTeamsPerCU () const override { // Accessor for the cross-team-reduction teams-per-CU envar.
2994
+ return OMPX_XTeamRedTeamsPerCU; // Raw envar value; defaults to 0, which the kernel launch code treats as "disabled".
2995
+ }
2983
2996
virtual uint32_t getOMPXBigJumpLoopMaxTotalTeams () const override { // Accessor for the big-jump-loop max-total-teams envar.
2984
2997
return OMPX_BigJumpLoopMaxTotalTeams; // Initialized with a default of 1024 * 1024 in the constructor's initializer list.
2985
2998
}
@@ -4427,6 +4440,12 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
4427
4440
// / OMPX_BigJumpLoopTeamsPerCU * #CUs.
4428
4441
UInt32Envar OMPX_BigJumpLoopTeamsPerCU;
4429
4442
4443
+ // / Envar capping the number of teams relative to the number of compute
4444
+ // / units (CUs) for cross-team-reduction kernels. 0 indicates that this
4445
+ // / value is not specified. If non-zero, the team limit is #CUs *
4446
+ // / min(default CU multiplier, OMPX_XTeamRedTeamsPerCU), so it can only lower the limit.
4447
+ UInt32Envar OMPX_XTeamRedTeamsPerCU;
4448
+
4430
4449
// / Envar controlling the maximum number of teams per device for
4431
4450
// / Big-Jump-Loop kernels.
4432
4451
UInt32Envar OMPX_BigJumpLoopMaxTotalTeams;
0 commit comments