[OpenMP][Offload][AMDGPU] Added tuning constraint for the number of threads (llvm#817)

ronlieb · web-flow · commit c70b8c18ffd7 · 2025-03-05T14:22:29.000-05:00
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -853,6 +853,10 @@ struct AMDGPUKernelTy : public GenericKernelTy {
   /// Indicates whether or not we need to set up our own private segment size.
   bool usesDynamicStack() const { return DynamicStack; }
 
+  bool isValidBlockSize(uint32_t BlockSize) const override {
+    return BlockSize <= ConstWGSize;
+  }
+
   /// Envar to enable occupancy-based optimization for SPMD kernel.
   BoolEnvar OMPX_SPMDOccupancyBasedOpt;
 
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -367,6 +367,9 @@ struct GenericKernelTy {
     return ExecutionMode == OMP_TGT_EXEC_MODE_XTEAM_RED;
   }
 
+  /// Return true if \p BlockSize is allowed for this kernel (default: any).
+  virtual bool isValidBlockSize(uint32_t BlockSize) const { return true; }
+
 protected:
   /// Get the execution mode name of the kernel.
   const char *getExecutionModeName() const {
@@ -1345,8 +1348,10 @@ struct KernelRunRecordTy {
 
   // Get parameters for next kernel launch.
   std::pair<uint32_t, uint32_t>
-  getLaunchParamsForKernel(std::string KernelName,
+  getLaunchParamsForKernel(const GenericKernelTy &Kernel,
                            GenericDeviceTy &GenericDevice) {
+    std::string KernelName = Kernel.getName();
+
     // If the kernel reaches the run limit,
     // return the current optimal launch parameters.
     if (reachedRunLimitForKernel(KernelName)) {
@@ -1360,7 +1365,10 @@ struct KernelRunRecordTy {
 
     if (IdxCUMulti >= CUMultiplierCandidate.size()) {
       // No more element to search.
+      // Set the run counter past RunLimiter to stop further tuning runs.
       // Return current optimal launch parameters.
+      TuningData[KernelName].RunCounters = RunLimiter + 1;
+
       return {TuningData[KernelName].MinEntry.NumTeams,
               TuningData[KernelName].MinEntry.NumThreads};
     }
@@ -1374,7 +1382,9 @@ struct KernelRunRecordTy {
     IdxThread++;
     TuningData[KernelName].IdxThread = IdxThread;
 
-    if (IdxThread >= ThreadCandidate.size()) {
+    // Wrap around if out of candidates or the next one is not a valid size.
+    if (IdxThread >= ThreadCandidate.size() ||
+        !Kernel.isValidBlockSize(ThreadCandidate[IdxThread])) {
       TuningData[KernelName].IdxThread = 0;
       TuningData[KernelName].IdxCUMultiplier++;
     }
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -740,7 +740,7 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
            "Autotuning is enabled, but KernelRunRecord is not initialized!");
 
     auto [Teams, Threads] =
-        KernelRecord->getLaunchParamsForKernel(KernelName, GenericDevice);
+        KernelRecord->getLaunchParamsForKernel(*this, GenericDevice);
     NumBlocks[0] = Teams;
     NumThreads[0] = Threads;
   } else {