Skip to content

Commit c70b8c1

Browse files
authored
[OpenMP][Offload][AMDGPU] Added tuning constraint for the number of threads (llvm#817)
2 parents f99745d + c7e6dce commit c70b8c1

File tree

3 files changed

+17
-3
lines changed

3 files changed

+17
-3
lines changed

offload/plugins-nextgen/amdgpu/src/rtl.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -853,6 +853,10 @@ struct AMDGPUKernelTy : public GenericKernelTy {
853853
/// Indicates whether or not we need to set up our own private segment size.
854854
bool usesDynamicStack() const { return DynamicStack; }
855855

856+
/// Returns true when \p BlockSize does not exceed ConstWGSize.
bool isValidBlockSize(uint32_t BlockSize) const override {
857+
return BlockSize <= ConstWGSize;
858+
}
859+
856860
/// Envar to enable occupancy-based optimization for SPMD kernel.
857861
BoolEnvar OMPX_SPMDOccupancyBasedOpt;
858862

offload/plugins-nextgen/common/include/PluginInterface.h

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -367,6 +367,9 @@ struct GenericKernelTy {
367367
return ExecutionMode == OMP_TGT_EXEC_MODE_XTEAM_RED;
368368
}
369369

370+
/// Indicates whether the input block size is within the limit. This default implementation accepts every block size.
371+
virtual bool isValidBlockSize(uint32_t BlockSize) const { return true; }
372+
370373
protected:
371374
/// Get the execution mode name of the kernel.
372375
const char *getExecutionModeName() const {
@@ -1345,8 +1348,10 @@ struct KernelRunRecordTy {
13451348

13461349
// Get parameters for next kernel launch.
13471350
std::pair<uint32_t, uint32_t>
1348-
getLaunchParamsForKernel(std::string KernelName,
1351+
getLaunchParamsForKernel(const GenericKernelTy &Kernel,
13491352
GenericDeviceTy &GenericDevice) {
1353+
std::string KernelName = Kernel.getName();
1354+
13501355
// If the kernel reaches the run limit,
13511356
// return the current optimal launch parameters.
13521357
if (reachedRunLimitForKernel(KernelName)) {
@@ -1360,7 +1365,10 @@ struct KernelRunRecordTy {
13601365

13611366
if (IdxCUMulti >= CUMultiplierCandidate.size()) {
13621367
// No more elements to search.
1368+
// Push the run counter past the run limit to stop further runs.
13631369
// Return current optimal launch parameters.
1370+
TuningData[KernelName].RunCounters = RunLimiter + 1;
1371+
13641372
return {TuningData[KernelName].MinEntry.NumTeams,
13651373
TuningData[KernelName].MinEntry.NumThreads};
13661374
}
@@ -1374,7 +1382,9 @@ struct KernelRunRecordTy {
13741382
IdxThread++;
13751383
TuningData[KernelName].IdxThread = IdxThread;
13761384

1377-
if (IdxThread >= ThreadCandidate.size()) {
1385+
// Threads should be within the limit.
1386+
if (IdxThread >= ThreadCandidate.size() ||
1387+
!Kernel.isValidBlockSize(ThreadCandidate[IdxThread])) {
13781388
TuningData[KernelName].IdxThread = 0;
13791389
TuningData[KernelName].IdxCUMultiplier++;
13801390
}

offload/plugins-nextgen/common/src/PluginInterface.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -740,7 +740,7 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
740740
"Autotuning is enabled, but KernelRunRecord is not initialized!");
741741

742742
auto [Teams, Threads] =
743-
KernelRecord->getLaunchParamsForKernel(KernelName, GenericDevice);
743+
KernelRecord->getLaunchParamsForKernel(*this, GenericDevice);
744744
NumBlocks[0] = Teams;
745745
NumThreads[0] = Threads;
746746
} else {

0 commit comments

Comments
 (0)