@@ -346,6 +346,39 @@ void GeneticTunerHarness::doCompile(
346
346
}
347
347
}
348
348
349
+ namespace {
350
+ std::vector<const DLTensor*> toConstDlpackTensors (
351
+ const std::vector<DLTensor*>& v) {
352
+ std::vector<const DLTensor*> out (v.begin (), v.end ());
353
+ return out;
354
+ }
355
+ } // namespace
356
+
357
+ template <typename ExecutorType>
358
+ std::vector<Duration> retrieveCachedRuntimes (
359
+ ExecutorType& engine,
360
+ const std::string& id,
361
+ const std::vector<const DLTensor*>& inputs,
362
+ const std::vector<DLTensor*>& outputs,
363
+ const CudaMappingOptions& options) {
364
+ if (not OptionsCache::cacheEnabled ()) {
365
+ return {};
366
+ }
367
+ auto cache = OptionsCache::getCache ();
368
+ auto allResults = cache->retrieveOptionsAndRuntimes (
369
+ id, inputs, toConstDlpackTensors (outputs));
370
+ auto wantedResult = std::find_if (
371
+ allResults.begin (),
372
+ allResults.end (),
373
+ [&options](const OptionsCache::RetrievalResult& r) {
374
+ return r.options == options;
375
+ });
376
+ if (wantedResult == allResults.end ()) {
377
+ return {};
378
+ }
379
+ return wantedResult->recordedRuntimes ;
380
+ }
381
+
349
382
template <typename ExecutorType, typename Population>
350
383
void GeneticTunerHarness::doGpuWork (
351
384
size_t gpu,
@@ -399,51 +432,56 @@ void GeneticTunerHarness::doGpuWork(
399
432
LOG_LINE_BY_LINE (INFO, ssInfo);
400
433
}
401
434
402
- std::vector<Duration> runtimes;
403
- try {
404
- size_t bestTimeSoFar;
405
- {
406
- std::lock_guard<std::mutex> lock (bestTimeMtx_);
407
- bestTimeSoFar = bestTime_;
408
- }
409
- auto prune =
410
- warmupOrPrune (engine, outputs, inputs, handle, bestTimeSoFar);
411
- if (prune) {
435
+ auto runtimes =
436
+ retrieveCachedRuntimes (engine, kKernelName_ , inputs, outputs, options);
437
+ if (runtimes.empty ()) {
438
+ try {
439
+ size_t bestTimeSoFar;
440
+ {
441
+ std::lock_guard<std::mutex> lock (bestTimeMtx_);
442
+ bestTimeSoFar = bestTime_;
443
+ }
444
+ auto prune =
445
+ warmupOrPrune (engine, outputs, inputs, handle, bestTimeSoFar);
446
+ if (prune) {
447
+ pConf->invalid = true ;
448
+ continue ;
449
+ } else {
450
+ runtimes.reserve (kReducedBenchmarkIterations );
451
+ for (size_t i = 0 ; i < kReducedBenchmarkIterations ; ++i) {
452
+ runtimes.push_back (engine.run (handle, inputs, outputs, true ));
453
+ }
454
+ engine.clear (handle);
455
+ }
456
+ } catch (std::exception& e) {
457
+ if (FLAGS_debug_tuner) {
458
+ LOG (WARNING) << " Runtime error gpu " << gpu << " : " << e.what ();
459
+ std::stringstream ssWarning;
460
+ CudaMappingOptionsCppPrinter warningPrinter (ssWarning);
461
+ warningPrinter << options;
462
+ LOG (WARNING) << " Aborted execution on gpu " << gpu;
463
+ LOG_LINE_BY_LINE (WARNING, ssWarning);
464
+ }
465
+ while (cudaGetLastError () != cudaSuccess) {
466
+ // In case of errors in the generated, we cannot rely on deviceReset
467
+ // to set the GPU in a clean state. So instead we just pop and discard
468
+ // all the errors accumulated on the GPU until we get to a clean slate
469
+ // (i.e. cudaSuccess).
470
+ ;
471
+ }
472
+ try {
473
+ // Some errors, such as illegal memory access, cannot be recovered
474
+ // from without a cudaDeviceReset (i.e. because user protection) In
475
+ // those cases we have no choice than to fail hard.
476
+ TC_CUDA_RUNTIMEAPI_ENFORCE (cudaDeviceSynchronize ());
477
+ } catch (const std::exception& e) {
478
+ LOG (FATAL) << " [CUDA][FATAL] cuda error on gpu " << gpu << " : "
479
+ << e.what () << " \n "
480
+ << CudaMappingOptionsAsCpp (options);
481
+ }
412
482
pConf->invalid = true ;
413
483
continue ;
414
- } else {
415
- runtimes.reserve (kReducedBenchmarkIterations );
416
- for (size_t i = 0 ; i < kReducedBenchmarkIterations ; ++i) {
417
- runtimes.push_back (engine.run (handle, inputs, outputs, true ));
418
- }
419
- engine.clear (handle);
420
484
}
421
- } catch (std::exception& e) {
422
- LOG (WARNING) << " Runtime error gpu " << gpu << " : " << e.what ();
423
- std::stringstream ssWarning;
424
- CudaMappingOptionsCppPrinter warningPrinter (ssWarning);
425
- warningPrinter << options;
426
- LOG (WARNING) << " Aborted execution on gpu " << gpu;
427
- LOG_LINE_BY_LINE (WARNING, ssWarning);
428
- while (cudaGetLastError () != cudaSuccess) {
429
- // In case of errors in the generated, we cannot rely on deviceReset to
430
- // set the GPU in a clean state. So instead we just pop and discard all
431
- // the errors accumulated on the GPU until we get to a clean slate
432
- // (i.e. cudaSuccess).
433
- ;
434
- }
435
- try {
436
- // Some errors, such as illegal memory access, cannot be recovered from
437
- // without a cudaDeviceReset (i.e. because user protection)
438
- // In those cases we have no choice than to fail hard.
439
- TC_CUDA_RUNTIMEAPI_ENFORCE (cudaDeviceSynchronize ());
440
- } catch (const std::exception& e) {
441
- LOG (FATAL) << " [CUDA][FATAL] cuda error on gpu " << gpu << " : "
442
- << e.what () << " \n "
443
- << CudaMappingOptionsAsCpp (options);
444
- }
445
- pConf->invalid = true ;
446
- continue ;
447
485
}
448
486
449
487
auto prof = median (runtimes);
0 commit comments