@@ -1095,11 +1095,18 @@ struct AMDGPUKernelTy : public GenericKernelTy {
     // Honor OMP_NUM_TEAMS environment variable for XteamReduction kernel
     // type, if possible.
     int32_t NumTeamsEnvVar = GenericDevice.getOMPNumTeams();
+    // CU multiplier from envar.
+    uint32_t EnvarCUMultiplier = GenericDevice.getXTeamRedTeamsPerCU();
 
     if (GenericDevice.isFastReductionEnabled()) {
       // When fast reduction is enabled, the number of teams is capped by
       // the MaxCUMultiplier constant.
-      MaxNumGroups = DeviceNumCUs * llvm::omp::xteam_red::MaxCUMultiplier;
+      // When the envar is set, use it to compute MaxNumGroups.
+      if (EnvarCUMultiplier > 0)
+        MaxNumGroups = DeviceNumCUs * EnvarCUMultiplier;
+      else
+        MaxNumGroups = DeviceNumCUs * llvm::omp::xteam_red::MaxCUMultiplier;
+
     } else {
       // When fast reduction is not enabled, the number of teams is capped
       // by the metadata that clang CodeGen created. The number of teams
@@ -1110,7 +1117,13 @@ struct AMDGPUKernelTy : public GenericKernelTy {
       // ConstWGSize is the block size that CodeGen used.
       uint32_t CUMultiplier =
           llvm::omp::xteam_red::getXteamRedCUMultiplier(ConstWGSize);
-      MaxNumGroups = DeviceNumCUs * CUMultiplier;
+
+      if (EnvarCUMultiplier > 0) {
+        MaxNumGroups =
+            DeviceNumCUs * std::min(CUMultiplier, EnvarCUMultiplier);
+      } else {
+        MaxNumGroups = DeviceNumCUs * CUMultiplier;
+      }
     }
 
     // If envar OMPX_XTEAMREDUCTION_OCCUPANCY_BASED_OPT is set and no
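The two hunks above encode the selection rule for the XTeam reduction team-count cap: with fast reduction enabled, a non-zero envar value replaces the built-in MaxCUMultiplier; otherwise the envar can only tighten the CodeGen-derived multiplier. A minimal standalone sketch of that rule (the function name and parameters below are illustrative assumptions, not the plugin's API):

```cpp
#include <algorithm>
#include <cstdint>

// Illustrative sketch, not the plugin source: how the envar-provided CU
// multiplier (0 = unset) interacts with the existing caps.
uint32_t computeXTeamRedMaxNumGroups(uint32_t DeviceNumCUs,
                                     bool FastReductionEnabled,
                                     uint32_t EnvarCUMultiplier,
                                     uint32_t DefaultMaxCUMultiplier,
                                     uint32_t CodeGenCUMultiplier) {
  if (FastReductionEnabled) {
    // A set envar overrides the built-in MaxCUMultiplier cap.
    uint32_t Multiplier =
        EnvarCUMultiplier > 0 ? EnvarCUMultiplier : DefaultMaxCUMultiplier;
    return DeviceNumCUs * Multiplier;
  }
  // Without fast reduction, the envar can only lower the multiplier derived
  // from CodeGen metadata, never raise it past that limit.
  uint32_t Multiplier = EnvarCUMultiplier > 0
                            ? std::min(CodeGenCUMultiplier, EnvarCUMultiplier)
                            : CodeGenCUMultiplier;
  return DeviceNumCUs * Multiplier;
}
```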
@@ -2915,6 +2928,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
             "LIBOMPTARGET_AMDGPU_GENERIC_SPMD_TEAMS_PER_CU", 6),
         OMPX_BigJumpLoopTeamsPerCU(
             "LIBOMPTARGET_AMDGPU_BIG_JUMP_LOOP_TEAMS_PER_CU", 0),
+        OMPX_XTeamRedTeamsPerCU("LIBOMPTARGET_AMDGPU_XTEAM_RED_TEAMS_PER_CU",
+                                0),
         OMPX_BigJumpLoopMaxTotalTeams(
             "LIBOMPTARGET_AMDGPU_BIG_JUMP_LOOP_MAX_TOTAL_TEAMS", 1024 * 1024),
         OMPX_LowTripCount("LIBOMPTARGET_AMDGPU_LOW_TRIPCOUNT", 9000),
@@ -2980,6 +2995,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   virtual uint32_t getOMPXBigJumpLoopTeamsPerCU() const override {
     return OMPX_BigJumpLoopTeamsPerCU;
   }
+  virtual uint32_t getXTeamRedTeamsPerCU() const override {
+    return OMPX_XTeamRedTeamsPerCU;
+  }
   virtual uint32_t getOMPXBigJumpLoopMaxTotalTeams() const override {
     return OMPX_BigJumpLoopMaxTotalTeams;
   }
@@ -4427,6 +4445,12 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   /// OMPX_BigJumpLoopTeamsPerCU * #CUs.
   UInt32Envar OMPX_BigJumpLoopTeamsPerCU;
 
+  /// Envar for controlling the number of teams relative to the number of
+  /// compute units (CUs) for cross-team-reduction kernels. 0 indicates that
+  /// this value is not specified. If non-zero, the number of teams =
+  /// OMPX_XTeamRedTeamsPerCU * #CUs.
+  UInt32Envar OMPX_XTeamRedTeamsPerCU;
+
   /// Envar controlling the maximum number of teams per device for
   /// Big-Jump-Loop kernels.
   UInt32Envar OMPX_BigJumpLoopMaxTotalTeams;
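For reference, a minimal standalone sketch of the documented semantics of the new envar from the user's side; the helper function and CU count below are illustrative assumptions, not the plugin's UInt32Envar implementation:

```cpp
#include <cstdint>
#include <cstdlib>

// Hypothetical helper: read LIBOMPTARGET_AMDGPU_XTEAM_RED_TEAMS_PER_CU,
// defaulting to 0 (unspecified) when the variable is not set.
static uint32_t readTeamsPerCU() {
  const char *Val = std::getenv("LIBOMPTARGET_AMDGPU_XTEAM_RED_TEAMS_PER_CU");
  return Val ? static_cast<uint32_t>(std::strtoul(Val, nullptr, 10)) : 0;
}

int main() {
  uint32_t NumCUs = 104; // arbitrary illustrative value; query the device in practice
  uint32_t TeamsPerCU = readTeamsPerCU();
  if (TeamsPerCU > 0) {
    // Per the doc comment above: number of teams = OMPX_XTeamRedTeamsPerCU * #CUs,
    // subject to the caps applied in the kernel-launch hunks earlier.
    uint32_t Teams = TeamsPerCU * NumCUs;
    (void)Teams;
  }
  return 0;
}
```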