@@ -314,6 +314,7 @@ void GeneticTunerHarness::doCompile(
314
314
if (current >= population.size ()) {
315
315
break ;
316
316
}
317
+
317
318
auto & pConf = population.at (current);
318
319
auto options = makeOptions (*pConf);
319
320
try {
@@ -432,7 +433,7 @@ void GeneticTunerHarness::doGpuWork(
432
433
LOG_LINE_BY_LINE (INFO, ssInfo);
433
434
}
434
435
435
- auto runtimes =
436
+ std::vector<Duration> runtimes =
436
437
retrieveCachedRuntimes (engine, kKernelName_ , inputs, outputs, options);
437
438
if (runtimes.empty ()) {
438
439
try {
@@ -526,59 +527,77 @@ void GeneticTunerHarness::runOneGeneration(size_t generation) {
526
527
527
528
auto setUpJobsAndRun = [&](GeneticSearch::Population& population,
528
529
const std::string& printerText) {
529
- // Initialize for this round
530
- currentCompilationJob_.store (0 );
531
- numEvaluations_.store (0 );
532
- readyToEvaluate_.resize (0 );
533
- for (size_t i = 0 ; i < population.size (); ++i) {
534
- readyToEvaluate_.emplace_back ();
535
- readyToEvaluate_[i].store (false );
536
- }
537
- Printer printer (
538
- printerText,
539
- readyToEvaluate_.size (),
540
- currentCompilationJob_,
541
- numEvaluations_);
542
- auto logGenerations = FLAGS_tuner_gen_log_generations;
543
- ScopeGuard sgPrinter ([logGenerations, &printer]() {
544
- printer.stop ();
545
- if (logGenerations) {
546
- printer.printAll ();
547
- }
548
- });
549
-
550
- // Just spawn and join new threads for each generation
551
- std::vector<std::thread> cpuCompilationThreads;
552
- cpuCompilationThreads.reserve (FLAGS_tuner_threads);
553
- ScopeGuard sgCompilationThreads ([&cpuCompilationThreads]() {
554
- for (auto & cpuCompilationThread : cpuCompilationThreads) {
555
- cpuCompilationThread.join ();
530
+ // Most candidates should have been evaluated during the previous
531
+ // generation's selection phase.
532
+ // There are two exceptions:
533
+ // 1) the 1st generation
534
+ // 2) too many invalid configurations were previously encountered and the
535
+ // valid ones were not enough to form a new generation.
536
+ auto firstNew = std::partition (
537
+ population.begin (),
538
+ population.end (),
539
+ [](const std::unique_ptr<CandidateConfiguration>& c) {
540
+ return c->runtime != Duration::zero ();
541
+ });
542
+ GeneticSearch::Population newCandidates (
543
+ std::distance (firstNew, population.end ()));
544
+ std::move (firstNew, population.end (), newCandidates.begin ());
545
+ {
546
+ // Initialize for this round
547
+ currentCompilationJob_.store (0 );
548
+ numEvaluations_.store (0 );
549
+ readyToEvaluate_.resize (0 );
550
+ for (size_t i = 0 ; i < newCandidates.size (); ++i) {
551
+ readyToEvaluate_.emplace_back ();
552
+ readyToEvaluate_[i].store (false );
556
553
}
557
- });
558
- for (int i = 0 ; i < FLAGS_tuner_threads; ++i) {
559
- cpuCompilationThreads.emplace_back ([this , &engine, &population]() {
560
- this ->doCompile (engine, population);
554
+ Printer printer (
555
+ printerText,
556
+ readyToEvaluate_.size (),
557
+ currentCompilationJob_,
558
+ numEvaluations_);
559
+ auto logGenerations = FLAGS_tuner_gen_log_generations;
560
+ ScopeGuard sgPrinter ([logGenerations, &printer]() {
561
+ printer.stop ();
562
+ if (logGenerations) {
563
+ printer.printAll ();
564
+ }
561
565
});
562
- }
563
566
564
- // Just spawn and join new threads for each generation
565
- std::vector<std::thread> gpuWorkerThreads;
566
- gpuWorkerThreads.reserve (gpus.size ());
567
- ScopeGuard sgGpuWorkerThreads ([&gpuWorkerThreads]() {
568
- for (auto & gpuWorkerThread : gpuWorkerThreads) {
569
- gpuWorkerThread.join ();
567
+ // Just spawn and join new threads for each generation
568
+ std::vector<std::thread> cpuCompilationThreads;
569
+ cpuCompilationThreads.reserve (FLAGS_tuner_threads);
570
+ ScopeGuard sgCompilationThreads ([&cpuCompilationThreads]() {
571
+ for (auto & cpuCompilationThread : cpuCompilationThreads) {
572
+ cpuCompilationThread.join ();
573
+ }
574
+ });
575
+ for (int i = 0 ; i < FLAGS_tuner_threads; ++i) {
576
+ cpuCompilationThreads.emplace_back ([this , &engine, &newCandidates]() {
577
+ this ->doCompile (engine, newCandidates);
578
+ });
579
+ }
580
+
581
+ // Just spawn and join new threads for each generation
582
+ std::vector<std::thread> gpuWorkerThreads;
583
+ gpuWorkerThreads.reserve (gpus.size ());
584
+ ScopeGuard sgGpuWorkerThreads ([&gpuWorkerThreads]() {
585
+ for (auto & gpuWorkerThread : gpuWorkerThreads) {
586
+ gpuWorkerThread.join ();
587
+ }
588
+ });
589
+ for (auto gpu : gpus) {
590
+ gpuWorkerThreads.emplace_back (
591
+ [this , gpu, &engine, &newCandidates, &printer]() {
592
+ this ->doGpuWork (gpu, engine, newCandidates, printer);
593
+ });
570
594
}
571
- });
572
- for (auto gpu : gpus) {
573
- gpuWorkerThreads.emplace_back (
574
- [this , gpu, &engine, &population, &printer]() {
575
- this ->doGpuWork (gpu, engine, population, printer);
576
- });
577
595
}
578
596
// At this point everything is synchronized because the scope above has been exited
597
+ std::move (newCandidates.begin (), newCandidates.end (), firstNew);
579
598
};
580
- std::cout << " Generation " << generation << std::endl;
581
- setUpJobsAndRun (tuner_->population , " Population " );
599
+ std::cout << " Generation " << generation << ' : ' << std::endl;
600
+ setUpJobsAndRun (tuner_->population , " New Candidates " );
582
601
tuner_->generateSelectionPool ();
583
602
setUpJobsAndRun (tuner_->selectionPool , " Selection Pool" );
584
603
tuner_->selectSurvivors ();
0 commit comments