@@ -60,7 +60,7 @@ struct GenericPluginTy;
60
60
struct GenericKernelTy ;
61
61
struct GenericDeviceTy ;
62
62
struct RecordReplayTy ;
63
- struct KernelRunRecord ;
63
+ struct KernelRunRecordTy ;
64
64
65
65
// / Class that wraps the __tgt_async_info to simplify its usage. In case the
66
66
// / object is constructed without a valid __tgt_async_info, the object will use
@@ -1108,7 +1108,7 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
1108
1108
1109
1109
bool getMultiDeviceKernelValue (void *EntryPtr);
1110
1110
1111
// Const accessor: returns the KernelRunRecords pointer member as-is.
// The removed (-) and added (+) lines differ only in the pointee type name
// (KernelRunRecord -> KernelRunRecordTy); the body is unchanged.
- KernelRunRecord *getKernelRunRecords () const { return KernelRunRecords; }
1111
+ KernelRunRecordTy *getKernelRunRecords () const { return KernelRunRecords; }
1112
1112
1113
1113
// / Return true if a descriptor of size 'Size' should be allocated using
1114
1114
// / shared memory. Default implementation returns 'false',
@@ -1262,7 +1262,7 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
1262
1262
RPCServerTy *RPCServer;
1263
1263
1264
1264
// / Structs for functions and data used in runtime autotuning.
1265
- KernelRunRecord *KernelRunRecords;
1265
+ KernelRunRecordTy *KernelRunRecords;
1266
1266
1267
1267
private:
1268
1268
#ifdef OMPT_SUPPORT
@@ -1291,35 +1291,39 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
1291
1291
};
1292
1292
1293
1293
// / Struct that represents the metadata for each kernel run on the device.
1294
- struct KernelRunRecord {
1294
+ struct KernelRunRecordTy {
1295
1295
1296
// One recorded kernel run: the kernel name plus the team count, thread count
// and run duration that addEntry() was called with.
// The added (+) side renames the struct to KernelRunEntryTy and gives the
// numeric fields explicit zero defaults.
- struct KernelRunEntry {
1296
+ struct KernelRunEntryTy {
1297
1297
// Name of the kernel this entry belongs to.
std::string KernelName;
1298
- uint32_t NumTeams;
1299
- uint32_t NumThreads;
1300
- uint64_t RunDuration;
1298
+ uint32_t NumTeams = 0 ;
1299
+ uint32_t NumThreads = 0 ;
1300
// A RunDuration of 0 is what addEntry() checks for to detect that no
// minimum has been stored yet.
+ uint64_t RunDuration = 0 ;
1301
1301
};
1302
1302
1303
1303
// Metadata used in tuning process.
// One instance is kept per kernel name as the mapped value of TuningData.
1304
- struct TuningMetadata {
1304
+ struct TuningMetadataTy {
1305
1305
// NOTE(review): presumably the current positions in the thread-count and
// CU-multiplier candidate lists; the code that advances them is not visible
// in this diff -- confirm.
uint32_t IdxThread = 0 ;
1306
1306
uint32_t IdxCUMultiplier = 0 ;
1307
1307
// Run counters.
// Incremented once per addEntry() call for this kernel.
1308
1308
uint32_t RunCounters = 0 ;
1309
1309
// Entry with minimum running time.
// Overwritten by addEntry() when a run is faster than the stored one, or
// when the stored RunDuration is still 0.
1310
- KernelRunEntry MinEntries ;
1310
+ KernelRunEntryTy MinEntry ;
1311
1311
};
1312
1312
1313
1313
// Add a new entry
// Records one run of kernel `KernelName`: increments that kernel's run
// counter and, if this run is faster than the stored minimum (or no minimum
// is stored yet), saves its launch parameters as the new minimum entry.
//   KernelName  - key into TuningData.
//   NumTeams    - team count stored with the entry.
//   NumThreads  - thread count stored with the entry.
//   RunDuration - duration compared against the stored minimum.
1314
1314
void addEntry (std::string KernelName, uint32_t NumTeams, uint32_t NumThreads,
1315
1315
uint64_t RunDuration) {
1316
- KernelRunEntry NewRunEnry = {KernelName, NumTeams, NumThreads, RunDuration};
1317
1316
// operator[] default-inserts a TuningData entry the first time a kernel name
// is seen, so the entry exists from this point on.
TuningData[KernelName].RunCounters ++;
1318
1317
1319
1318
// Update min entries.
1320
- auto MinDuration = TuningData[KernelName].MinEntries .RunDuration ;
1319
+ uint64_t MinDuration = 0 ;
1320
// NOTE(review): the operator[] call above has already created the entry, so
// this find() should never return end(); the guard looks redundant --
// confirm before simplifying.
+ auto It = TuningData.find (KernelName);
1321
+ if (It != TuningData.end ()) {
1322
+ MinDuration = It->second .MinEntry .RunDuration ;
1323
+ }
1321
1324
// MinDuration == 0 is treated as "no minimum stored yet", so the first run
// of a kernel is always recorded.
// NOTE(review): a run whose RunDuration is 0 would be stored and then
// overwritten by the next run, since 0 is indistinguishable from "unset".
if (MinDuration > RunDuration || MinDuration == 0 ) {
1322
- TuningData[KernelName].MinEntries = NewRunEnry;
1325
+ TuningData[KernelName].MinEntry = {KernelName, NumTeams, NumThreads,
1326
+ RunDuration};
1323
1327
}
1324
1328
}
1325
1329
@@ -1330,7 +1334,7 @@ struct KernelRunRecord {
1330
1334
// If the kernel reaches the run limit,
1331
1335
// return the current optimal launch parameters.
1332
1336
if (reachedRunLimitForKernel (KernelName)) {
1333
- auto MinEntry = TuningData[KernelName].MinEntries ;
1337
+ auto MinEntry = TuningData[KernelName].MinEntry ;
1334
1338
return {MinEntry.NumTeams , MinEntry.NumThreads };
1335
1339
}
1336
1340
@@ -1341,8 +1345,8 @@ struct KernelRunRecord {
1341
1345
if (IdxCUMulti >= CUMultiplierCandidate.size ()) {
1342
1346
// No more element to search.
1343
1347
// Return current optimal launch parameters.
1344
- return {TuningData[KernelName].MinEntries .NumTeams ,
1345
- TuningData[KernelName].MinEntries .NumThreads };
1348
+ return {TuningData[KernelName].MinEntry .NumTeams ,
1349
+ TuningData[KernelName].MinEntry .NumThreads };
1346
1350
}
1347
1351
1348
1352
// New team/thread pair for launch parameters.
@@ -1363,7 +1367,7 @@ struct KernelRunRecord {
1363
1367
}
1364
1368
1365
1369
bool reachedRunLimitForKernel (std::string KernelName) {
1366
- if (TuningData.count (KernelName) == 0 ) {
1370
+ if (TuningData.find (KernelName) == TuningData. end () ) {
1367
1371
// If no record for this kernel.
1368
1372
return false ;
1369
1373
}
@@ -1372,7 +1376,7 @@ struct KernelRunRecord {
1372
1376
}
1373
1377
1374
1378
uint32_t getRunCounterForKernel (std::string KernelName) {
1375
- if (TuningData.count (KernelName) == 0 ) {
1379
+ if (TuningData.find (KernelName) == TuningData. end () ) {
1376
1380
return 0 ;
1377
1381
}
1378
1382
@@ -1386,7 +1390,7 @@ struct KernelRunRecord {
1386
1390
// The max number of tuning runs for each kernel.
1387
1391
uint32_t RunLimiter = ThreadCandidate.size() * CUMultiplierCandidate.size();
1388
1392
// Used for keeping track of the metadata used in tuning for each kernel.
1389
- std::unordered_map<std::string, TuningMetadata > TuningData;
1393
+ std::unordered_map<std::string, TuningMetadataTy > TuningData;
1390
1394
};
1391
1395
1392
1396
// / Class implementing common functionalities of offload plugins. Each plugin
0 commit comments