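Addressed review comments on runtime autotuning

- Rename KernelRunRecord, KernelRunEntry and TuningMetadata to
  KernelRunRecordTy, KernelRunEntryTy and TuningMetadataTy, and rename the
  MinEntries field to MinEntry.
- Remove AMDGPUKernelTy::getExecutionMode() and check Kernel.isSPMDMode()
  instead.
- Default-initialize the numeric fields of KernelRunEntryTy and assert that
  the kernel run records are non-null before registering the callback.
- Use find() instead of count() when looking up kernels in TuningData.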

Kewen12 · Kewen12 · commit 2107a33d1042 · 2025-02-14T21:56:55.000-06:00
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -884,11 +884,6 @@ struct AMDGPUKernelTy : public GenericKernelTy {
   /// Indicates whether or not we need to set up our own private segment size.
   bool usesDynamicStack() const { return DynamicStack; }
 
-  /// Get the execution mode of this kernel.
-  OMPTgtExecModeFlags getExecutionMode() const {
-    return getExecutionModeFlags();
-  }
-
   /// Envar to disable host-exec thread creation.
   BoolEnvar OMPX_DisableHostExec;
 
@@ -1717,7 +1712,7 @@ struct AMDGPUStreamTy {
     std::string KernelName;
     uint32_t NumTeams;
     uint32_t NumThreads;
-    KernelRunRecord *KernelRunRecords;
+    KernelRunRecordTy *KernelRunRecords;
   };
 
   using AMDGPUStreamCallbackTy = Error(void *Data);
@@ -2096,7 +2091,7 @@ struct AMDGPUStreamTy {
     PostKernelRunProcessingArgsTy *Args =
         reinterpret_cast<PostKernelRunProcessingArgsTy *>(Data);
 
-    KernelRunRecord *KernelRecord = Args->KernelRunRecords;
+    KernelRunRecordTy *KernelRecord = Args->KernelRunRecords;
     assert(KernelRecord && "KernelRunRecord is null!");
 
     uint64_t KernelDuration = getKernelDuration(Args);
@@ -2191,10 +2186,10 @@ struct AMDGPUStreamTy {
 
     // If runtime autotuning is enabled, setup the callback functions to process
     // the data after kernel completed.
-    if (Device.enableRuntimeAutotuning() &&
-        Kernel.getExecutionMode() == OMP_TGT_EXEC_MODE_SPMD) {
+    if (Device.enableRuntimeAutotuning() && Kernel.isSPMDMode()) {
       std::string KernelName(Kernel.getName());
-      KernelRunRecord *KernelRecords = Device.getKernelRunRecords();
+      KernelRunRecordTy *KernelRecords = Device.getKernelRunRecords();
+      assert(KernelRecords && "No KernelRecords!");
 
       // If this kernel has reached the run limit,
       // skip registering the callback function.
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -60,7 +60,7 @@ struct GenericPluginTy;
 struct GenericKernelTy;
 struct GenericDeviceTy;
 struct RecordReplayTy;
-struct KernelRunRecord;
+struct KernelRunRecordTy;
 
 /// Class that wraps the __tgt_async_info to simply its usage. In case the
 /// object is constructed without a valid __tgt_async_info, the object will use
@@ -1108,7 +1108,7 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
 
   bool getMultiDeviceKernelValue(void *EntryPtr);
 
-  KernelRunRecord *getKernelRunRecords() const { return KernelRunRecords; }
+  KernelRunRecordTy *getKernelRunRecords() const { return KernelRunRecords; }
 
   /// Return true if a descriptor of size 'Size' should be allocated using
   /// shared memory. Default implementation returns 'false',
@@ -1262,7 +1262,7 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   RPCServerTy *RPCServer;
 
   /// Structs for functions and data used in runtime autotuning.
-  KernelRunRecord *KernelRunRecords;
+  KernelRunRecordTy *KernelRunRecords;
 
 private:
 #ifdef OMPT_SUPPORT
@@ -1291,35 +1291,39 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
 };
 
 /// Struct represents the metadata for each kernel run on the device.
-struct KernelRunRecord {
+struct KernelRunRecordTy {
 
-  struct KernelRunEntry {
+  struct KernelRunEntryTy {
     std::string KernelName;
-    uint32_t NumTeams;
-    uint32_t NumThreads;
-    uint64_t RunDuration;
+    uint32_t NumTeams = 0;
+    uint32_t NumThreads = 0;
+    uint64_t RunDuration = 0;
   };
 
   // Metadata used in tuning process.
-  struct TuningMetadata {
+  struct TuningMetadataTy {
     uint32_t IdxThread = 0;
     uint32_t IdxCUMultiplier = 0;
     // Run counters.
     uint32_t RunCounters = 0;
     // Entry with minimum running time.
-    KernelRunEntry MinEntries;
+    KernelRunEntryTy MinEntry;
   };
 
   // Add a new entry
   void addEntry(std::string KernelName, uint32_t NumTeams, uint32_t NumThreads,
                 uint64_t RunDuration) {
-    KernelRunEntry NewRunEnry = {KernelName, NumTeams, NumThreads, RunDuration};
     TuningData[KernelName].RunCounters++;
 
     // Update min entries.
-    auto MinDuration = TuningData[KernelName].MinEntries.RunDuration;
+    uint64_t MinDuration = 0;
+    auto It = TuningData.find(KernelName);
+    if (It != TuningData.end()) {
+      MinDuration = It->second.MinEntry.RunDuration;
+    }
     if (MinDuration > RunDuration || MinDuration == 0) {
-      TuningData[KernelName].MinEntries = NewRunEnry;
+      TuningData[KernelName].MinEntry = {KernelName, NumTeams, NumThreads,
+                                         RunDuration};
     }
   }
 
@@ -1330,7 +1334,7 @@ struct KernelRunRecord {
     // If the kernel reaches the run limit,
     // return the current optimal launch parameters.
     if (reachedRunLimitForKernel(KernelName)) {
-      auto MinEntry = TuningData[KernelName].MinEntries;
+      auto MinEntry = TuningData[KernelName].MinEntry;
       return {MinEntry.NumTeams, MinEntry.NumThreads};
     }
 
@@ -1341,8 +1345,8 @@ struct KernelRunRecord {
     if (IdxCUMulti >= CUMultiplierCandidate.size()) {
       // No more element to search.
       // Return current optimal launch parameters.
-      return {TuningData[KernelName].MinEntries.NumTeams,
-              TuningData[KernelName].MinEntries.NumThreads};
+      return {TuningData[KernelName].MinEntry.NumTeams,
+              TuningData[KernelName].MinEntry.NumThreads};
     }
 
     // New team/thread pair for launch parameters.
@@ -1363,7 +1367,7 @@ struct KernelRunRecord {
   }
 
   bool reachedRunLimitForKernel(std::string KernelName) {
-    if (TuningData.count(KernelName) == 0) {
+    if (TuningData.find(KernelName) == TuningData.end()) {
       // If no record for this kernel.
       return false;
     }
@@ -1372,7 +1376,7 @@ struct KernelRunRecord {
   }
 
   uint32_t getRunCounterForKernel(std::string KernelName) {
-    if (TuningData.count(KernelName) == 0) {
+    if (TuningData.find(KernelName) == TuningData.end()) {
       return 0;
     }
 
@@ -1386,7 +1390,7 @@ struct KernelRunRecord {
   // The max number of tuning runs for each kernel.
   uint32_t RunLimiter = ThreadCandidate.size() * CUMultiplierCandidate.size();
   // Used for keeping track of the metatdata used in tuning for each kernel.
-  std::unordered_map<std::string, TuningMetadata> TuningData;
+  std::unordered_map<std::string, TuningMetadataTy> TuningData;
 };
 
 /// Class implementing common functionalities of offload plugins. Each plugin
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -727,7 +727,7 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
                            KernelArgs.NumTeams[2]};
 
   std::string KernelName = getName();
-  KernelRunRecord *KernelRecord = GenericDevice.getKernelRunRecords();
+  KernelRunRecordTy *KernelRecord = GenericDevice.getKernelRunRecords();
   uint32_t KernelRunCounter = 0;
 
   if (KernelRecord) {
@@ -1033,7 +1033,7 @@ Error GenericDeviceTy::init(GenericPluginTy &Plugin) {
 
   // Allocate resources for autotuning if enabled.
   if (OMPX_EnableRuntimeAutotuning) {
-    KernelRunRecords = new KernelRunRecord();
+    KernelRunRecords = new KernelRunRecordTy();
   }
 
   return Plugin::success();