[Offload][AMDGPU] Added support for runtime tuning

Kewen12 · Kewen12 · commit 99d9b078272c · 2025-02-13T17:44:58.000-06:00
This PR implements the structs and functions needed for runtime tuning. The initial tuning logic is deliberately simple: it uses hard-coded candidates and iterates over them exhaustively. We will continue to improve it in follow-up patches, based on further discussion and experiments.
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -1709,6 +1709,10 @@ struct AMDGPUStreamTy {
     hsa_agent_t Agent;
     AMDGPUSignalTy *Signal;
     double TicksToTime;
+    std::string KernelName;
+    uint32_t NumTeams;
+    uint32_t NumThreads;
+    KernelRunRecord *KernelRunRecords;
   };
 
   using AMDGPUStreamCallbackTy = Error(void *Data);
@@ -2087,9 +2091,20 @@ struct AMDGPUStreamTy {
     PostKernelRunProcessingArgsTy *Args =
         reinterpret_cast<PostKernelRunProcessingArgsTy *>(Data);
 
-    uint64_t KernelDuration = getKernelDuration(Args);
-    fprintf(stderr, "Kernel Duration: %lu ns\n", KernelDuration);
+    KernelRunRecord *KernelRecord = Args->KernelRunRecords;
+    assert(KernelRecord && "KernelRunRecord is null!");
 
+    uint64_t KernelDuration = getKernelDuration(Args);
+    KernelRecord->addEntry(Args->KernelName, Args->NumTeams, Args->NumThreads,
+                           KernelDuration);
+
+    if (getInfoLevel() & OMP_INFOTYPE_AMD_KERNEL_TRACE) {
+      fprintf(stderr,
+              "[Autotuning run] Kernel %s with %u teams and %u threads "
+              "completed in %lu ns.\n",
+              Args->KernelName.c_str(), Args->NumTeams, Args->NumThreads,
+              KernelDuration);
+    }
     return Plugin::success();
   }
 
@@ -2172,13 +2187,24 @@ struct AMDGPUStreamTy {
     // If runtime autotuning is enabled, setup the callback functions to process
     // the data after kernel completed.
     if (Device.enableRuntimeAutotuning()) {
-      PostKernelRunProcessingArgs.Agent = Agent;
-      PostKernelRunProcessingArgs.Signal = OutputSignal;
-      PostKernelRunProcessingArgs.TicksToTime = 1.0;
-
-      if (auto Err = Slots[Curr].schedCallback(postKernelRunProcessingAction,
-                                               &PostKernelRunProcessingArgs))
-        return Err;
+      std::string KernelName(Kernel.getName());
+      KernelRunRecord *KernelRecords = Device.getKernelRunRecords();
+
+      // If this kernel has reached the run limit,
+      // skip registering the callback function.
+      if (!KernelRecords->reachedRunLimitForKernel(KernelName)) {
+        PostKernelRunProcessingArgs.Agent = Agent;
+        PostKernelRunProcessingArgs.Signal = OutputSignal;
+        PostKernelRunProcessingArgs.TicksToTime = 1.0;
+        PostKernelRunProcessingArgs.KernelName = KernelName;
+        PostKernelRunProcessingArgs.NumTeams = NumBlocks[0];
+        PostKernelRunProcessingArgs.NumThreads = NumThreads[0];
+        PostKernelRunProcessingArgs.KernelRunRecords = KernelRecords;
+
+        if (auto Err = Slots[Curr].schedCallback(postKernelRunProcessingAction,
+                                                 &PostKernelRunProcessingArgs))
+          return Err;
+      }
     }
 
     // Push the kernel with the output signal and an input signal (optional)
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -17,6 +17,8 @@
 #include <list>
 #include <map>
 #include <shared_mutex>
+#include <unordered_map>
+#include <unordered_set>
 #include <vector>
 
 #include "ExclusiveAccess.h"
@@ -58,6 +60,7 @@ struct GenericPluginTy;
 struct GenericKernelTy;
 struct GenericDeviceTy;
 struct RecordReplayTy;
+struct KernelRunRecord;
 
 /// Class that wraps the __tgt_async_info to simply its usage. In case the
 /// object is constructed without a valid __tgt_async_info, the object will use
@@ -1105,6 +1108,8 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
 
   bool getMultiDeviceKernelValue(void *EntryPtr);
 
+  KernelRunRecord *getKernelRunRecords() const { return KernelRunRecords; }
+
   /// Return true if a descriptor of size 'Size' should be allocated using
   /// shared memory. Default implementation returns 'false',
   virtual bool useSharedMemForDescriptor(int64_t Size);
@@ -1256,6 +1261,9 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   /// This is used to run the RPC server during task synchronization.
   RPCServerTy *RPCServer;
 
+  /// Structs for functions and data used in runtime autotuning.
+  KernelRunRecord *KernelRunRecords;
+
 private:
 #ifdef OMPT_SUPPORT
   /// OMPT callback functions
@@ -1282,6 +1290,99 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   bool IsFastReductionEnabled = false;
 };
 
+/// Run history and launch-parameter search state for runtime autotuning,
+/// tracked per kernel name.
+/// NOTE(review): there is no locking here; confirm callers serialize access.
+struct KernelRunRecord {
+  /// One measured kernel launch.
+  struct KernelRunEntry {
+    std::string KernelName;
+    uint32_t NumTeams = 0;
+    uint32_t NumThreads = 0;
+    uint64_t RunDuration = 0;
+  };
+
+  /// Metadata used in the tuning process of a single kernel.
+  struct TuningMetadata {
+    /// Index into ThreadCandidate of the next thread count to try.
+    uint32_t IdxThread = 0;
+    /// Index into CUMultiplierCandidate of the multiplier currently tried.
+    uint32_t IdxCUMultiplier = 0;
+    /// Tuning history.
+    std::vector<KernelRunEntry> RunEntries;
+    /// Number of runs recorded through addEntry().
+    uint32_t RunCounters = 0;
+    /// Entry with minimum running time; RunDuration == 0 means "not set yet".
+    KernelRunEntry MinEntries;
+  };
+
+  /// Record a finished run of \p KernelName and refresh its fastest entry.
+  void addEntry(const std::string &KernelName, uint32_t NumTeams,
+                uint32_t NumThreads, uint64_t RunDuration) {
+    // Look the kernel up once instead of re-hashing the name on every access.
+    TuningMetadata &Data = TuningData[KernelName];
+    Data.RunEntries.push_back({KernelName, NumTeams, NumThreads, RunDuration});
+    ++Data.RunCounters;
+
+    // A zero minimum means no entry has been stored yet.
+    const uint64_t MinDuration = Data.MinEntries.RunDuration;
+    if (MinDuration == 0 || RunDuration < MinDuration)
+      Data.MinEntries = Data.RunEntries.back();
+  }
+
+  /// Get the {teams, threads} pair for the next launch of \p KernelName.
+  /// Candidates are handed out one per call: every ThreadCandidate is paired
+  /// with the current CUMultiplierCandidate (scaled by the compute unit count
+  /// of \p GenericDevice) before moving on to the next multiplier. Once the
+  /// run limit is reached or the candidates are exhausted, the parameters of
+  /// the fastest recorded run are returned instead.
+  std::pair<uint32_t, uint32_t>
+  getLaunchParamsForKernel(const std::string &KernelName,
+                           GenericDeviceTy &GenericDevice) {
+    TuningMetadata &Data = TuningData[KernelName];
+
+    // Nothing left to explore: return the current optimal launch parameters.
+    if (Data.RunCounters > RunLimiter ||
+        Data.IdxCUMultiplier >= CUMultiplierCandidate.size())
+      return {Data.MinEntries.NumTeams, Data.MinEntries.NumThreads};
+
+    // New team/thread pair for launch parameters.
+    uint32_t NumCU = GenericDevice.getNumComputeUnits();
+    std::pair<uint32_t, uint32_t> NewLaunchParams = {
+        CUMultiplierCandidate[Data.IdxCUMultiplier] * NumCU,
+        ThreadCandidate[Data.IdxThread]};
+
+    // Advance the thread index; on wrap-around move to the next multiplier.
+    if (++Data.IdxThread >= ThreadCandidate.size()) {
+      Data.IdxThread = 0;
+      ++Data.IdxCUMultiplier;
+    }
+
+    return NewLaunchParams;
+  }
+
+  /// Return true once \p KernelName has more than RunLimiter recorded runs.
+  bool reachedRunLimitForKernel(const std::string &KernelName) const {
+    return getRunCounterForKernel(KernelName) > RunLimiter;
+  }
+
+  /// Return the number of recorded runs of \p KernelName, 0 if it is unknown.
+  /// Uses find() so that a pure query never inserts an empty map entry.
+  uint32_t getRunCounterForKernel(const std::string &KernelName) const {
+    auto It = TuningData.find(KernelName);
+    return It == TuningData.end() ? 0 : It->second.RunCounters;
+  }
+
+private:
+  /// Candidates for thread and team.
+  std::vector<uint32_t> ThreadCandidate = {32, 64, 128, 256, 512, 1024};
+  std::vector<uint32_t> CUMultiplierCandidate = {4, 8, 16, 32, 64, 128};
+  /// The max number of tuning runs for each kernel.
+  uint32_t RunLimiter = ThreadCandidate.size() * CUMultiplierCandidate.size();
+  /// Used for keeping track of the metadata used in tuning for each kernel.
+  std::unordered_map<std::string, TuningMetadata> TuningData;
+};
+
 /// Class implementing common functionalities of offload plugins. Each plugin
 /// should define the specific plugin class, derive from this generic one, and
 /// implement the necessary virtual function members.
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -726,21 +726,39 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
   uint32_t NumBlocks[3] = {KernelArgs.NumTeams[0], KernelArgs.NumTeams[1],
                            KernelArgs.NumTeams[2]};
 
-  // TODO fix workaround since IsBareKernel is not properly set for legacy
-  // flang and specialized kernels since they don't use kernel-env. While
-  // we can check for specialized kernels, we can't for legacy flang. So,
-  // on amd-staging, all kernels including bare ones use this codepath.
-  NumThreads[0] = getNumThreads(GenericDevice, NumThreads);
-
-  std::pair<bool, uint32_t> AdjustInfo = adjustNumThreadsForLowTripCount(
-      GenericDevice, NumThreads[0], KernelArgs.Tripcount,
-      KernelArgs.ThreadLimit);
-  if (AdjustInfo.first)
-    NumThreads[0] = AdjustInfo.second;
-
-  NumBlocks[0] = getNumBlocks(GenericDevice, NumBlocks, KernelArgs.Tripcount,
-                              NumThreads[0], KernelArgs.ThreadLimit[0] > 0);
-  // }
+  std::string KernelName = getName();
+  KernelRunRecord *KernelRecord = GenericDevice.getKernelRunRecords();
+  uint32_t KernelRunCounter = 0;
+
+  if (KernelRecord) {
+    KernelRunCounter = KernelRecord->getRunCounterForKernel(KernelName);
+  }
+  // If Autotuning is enabled and the kernel is not launched for the first time.
+  if (GenericDevice.enableRuntimeAutotuning() && KernelRunCounter > 0) {
+    assert(KernelRecord &&
+           "Autotuning is enabled, but KernelRunRecord is not initialized!");
+
+    auto [Teams, Threads] =
+        KernelRecord->getLaunchParamsForKernel(KernelName, GenericDevice);
+    NumBlocks[0] = Teams;
+    NumThreads[0] = Threads;
+  } else {
+
+    // TODO fix workaround since IsBareKernel is not properly set for legacy
+    // flang and specialized kernels since they don't use kernel-env. While
+    // we can check for specialized kernels, we can't for legacy flang. So,
+    // on amd-staging, all kernels including bare ones use this codepath.
+    NumThreads[0] = getNumThreads(GenericDevice, NumThreads);
+
+    std::pair<bool, uint32_t> AdjustInfo = adjustNumThreadsForLowTripCount(
+        GenericDevice, NumThreads[0], KernelArgs.Tripcount,
+        KernelArgs.ThreadLimit);
+    if (AdjustInfo.first)
+      NumThreads[0] = AdjustInfo.second;
+
+    NumBlocks[0] = getNumBlocks(GenericDevice, NumBlocks, KernelArgs.Tripcount,
+                                NumThreads[0], KernelArgs.ThreadLimit[0] > 0);
+  }
 
   // Record the kernel description after we modified the argument count and num
   // blocks/threads.
@@ -930,7 +948,7 @@ GenericDeviceTy::GenericDeviceTy(GenericPluginTy &Plugin, int32_t DeviceId,
       OMPX_EnableRuntimeAutotuning("OMPX_ENABLE_RUNTIME_AUTOTUNING", false),
       DeviceId(DeviceId), GridValues(OMPGridValues),
       PeerAccesses(NumDevices, PeerAccessState::PENDING), PeerAccessesLock(),
-      PinnedAllocs(*this), RPCServer(nullptr) {
+      PinnedAllocs(*this), RPCServer(nullptr), KernelRunRecords(nullptr) {
 #ifdef OMPT_SUPPORT
   OmptInitialized.store(false);
   // Bind the callbacks to this device's member functions
@@ -1012,6 +1030,11 @@ Error GenericDeviceTy::init(GenericPluginTy &Plugin) {
     MemoryManager = new MemoryManagerTy(*this, ThresholdMM);
   }
 
+  // Allocate resources for autotuning if enabled.
+  if (OMPX_EnableRuntimeAutotuning) {
+    KernelRunRecords = new KernelRunRecord();
+  }
+
   return Plugin::success();
 }
 
@@ -1084,6 +1107,13 @@ Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) {
     if (auto Err = RPCServer->deinitDevice(*this))
       return Err;
 
+  // Delete autotuning related resources if the option is on.
+  if (OMPX_EnableRuntimeAutotuning) {
+    if (KernelRunRecords)
+      delete KernelRunRecords;
+    KernelRunRecords = nullptr;
+  }
+
 #ifdef OMPT_SUPPORT
   if (ompt::Initialized) {
     bool ExpectedStatus = true;