@@ -313,6 +313,7 @@ void GeneticTunerHarness::doCompile(
313
313
if (current >= population.size ()) {
314
314
break ;
315
315
}
316
+
316
317
auto & pConf = population.at (current);
317
318
auto options = makeOptions (*pConf);
318
319
try {
@@ -433,7 +434,7 @@ void GeneticTunerHarness::doGpuWork(
433
434
LOG_LINE_BY_LINE (INFO, ssInfo);
434
435
}
435
436
436
- auto runtimes =
437
+ std::vector<Duration> runtimes =
437
438
retrieveCachedRuntimes (engine, kKernelName_ , inputs, outputs, options);
438
439
if (runtimes.empty ()) {
439
440
try {
@@ -527,59 +528,77 @@ void GeneticTunerHarness::runOneGeneration(size_t generation) {
527
528
528
529
auto setUpJobsAndRun = [&](GeneticSearch::Population& population,
529
530
const std::string& printerText) {
530
- // Initialize for this round
531
- currentCompilationJob_.store (0 );
532
- numEvaluations_.store (0 );
533
- readyToEvaluate_.resize (0 );
534
- for (size_t i = 0 ; i < population.size (); ++i) {
535
- readyToEvaluate_.emplace_back ();
536
- readyToEvaluate_[i].store (false );
537
- }
538
- Printer printer (
539
- printerText,
540
- readyToEvaluate_.size (),
541
- currentCompilationJob_,
542
- numEvaluations_);
543
- auto logGenerations = FLAGS_tuner_gen_log_generations;
544
- ScopeGuard sgPrinter ([logGenerations, &printer]() {
545
- printer.stop ();
546
- if (logGenerations) {
547
- printer.printAll ();
548
- }
549
- });
550
-
551
- // Just spawn and join new threads for each generation
552
- std::vector<std::thread> cpuCompilationThreads;
553
- cpuCompilationThreads.reserve (FLAGS_tuner_threads);
554
- ScopeGuard sgCompilationThreads ([&cpuCompilationThreads]() {
555
- for (auto & cpuCompilationThread : cpuCompilationThreads) {
556
- cpuCompilationThread.join ();
531
+ // Most candidates should have been evaluated during the previous
532
+ // generation's selection phase.
533
+ // There are two exceptions:
534
+ // 1) the 1st generation
535
+ // 2) too many invalid configurations were previously encountered and the
536
+ // valid ones were not enough to form a new generation.
537
+ auto firstNew = std::partition (
538
+ population.begin (),
539
+ population.end (),
540
+ [](const std::unique_ptr<CandidateConfiguration>& c) {
541
+ return c->runtime != Duration::zero ();
542
+ });
543
+ GeneticSearch::Population newCandidates (
544
+ std::distance (firstNew, population.end ()));
545
+ std::move (firstNew, population.end (), newCandidates.begin ());
546
+ {
547
+ // Initialize for this round
548
+ currentCompilationJob_.store (0 );
549
+ numEvaluations_.store (0 );
550
+ readyToEvaluate_.resize (0 );
551
+ for (size_t i = 0 ; i < newCandidates.size (); ++i) {
552
+ readyToEvaluate_.emplace_back ();
553
+ readyToEvaluate_[i].store (false );
557
554
}
558
- });
559
- for (int i = 0 ; i < FLAGS_tuner_threads; ++i) {
560
- cpuCompilationThreads.emplace_back ([this , &engine, &population]() {
561
- this ->doCompile (engine, population);
555
+ Printer printer (
556
+ printerText,
557
+ readyToEvaluate_.size (),
558
+ currentCompilationJob_,
559
+ numEvaluations_);
560
+ auto logGenerations = FLAGS_tuner_gen_log_generations;
561
+ ScopeGuard sgPrinter ([logGenerations, &printer]() {
562
+ printer.stop ();
563
+ if (logGenerations) {
564
+ printer.printAll ();
565
+ }
562
566
});
563
- }
564
567
565
- // Just spawn and join new threads for each generation
566
- std::vector<std::thread> gpuWorkerThreads;
567
- gpuWorkerThreads.reserve (gpus.size ());
568
- ScopeGuard sgGpuWorkerThreads ([&gpuWorkerThreads]() {
569
- for (auto & gpuWorkerThread : gpuWorkerThreads) {
570
- gpuWorkerThread.join ();
568
+ // Just spawn and join new threads for each generation
569
+ std::vector<std::thread> cpuCompilationThreads;
570
+ cpuCompilationThreads.reserve (FLAGS_tuner_threads);
571
+ ScopeGuard sgCompilationThreads ([&cpuCompilationThreads]() {
572
+ for (auto & cpuCompilationThread : cpuCompilationThreads) {
573
+ cpuCompilationThread.join ();
574
+ }
575
+ });
576
+ for (int i = 0 ; i < FLAGS_tuner_threads; ++i) {
577
+ cpuCompilationThreads.emplace_back ([this , &engine, &newCandidates]() {
578
+ this ->doCompile (engine, newCandidates);
579
+ });
580
+ }
581
+
582
+ // Just spawn and join new threads for each generation
583
+ std::vector<std::thread> gpuWorkerThreads;
584
+ gpuWorkerThreads.reserve (gpus.size ());
585
+ ScopeGuard sgGpuWorkerThreads ([&gpuWorkerThreads]() {
586
+ for (auto & gpuWorkerThread : gpuWorkerThreads) {
587
+ gpuWorkerThread.join ();
588
+ }
589
+ });
590
+ for (auto gpu : gpus) {
591
+ gpuWorkerThreads.emplace_back (
592
+ [this , gpu, &engine, &newCandidates, &printer]() {
593
+ this ->doGpuWork (gpu, engine, newCandidates, printer);
594
+ });
571
595
}
572
- });
573
- for (auto gpu : gpus) {
574
- gpuWorkerThreads.emplace_back (
575
- [this , gpu, &engine, &population, &printer]() {
576
- this ->doGpuWork (gpu, engine, population, printer);
577
- });
578
596
}
579
597
// At this point everything is synchronized because out of scope, done
598
+ std::move (newCandidates.begin (), newCandidates.end (), firstNew);
580
599
};
581
- std::cout << " Generation " << generation << std::endl;
582
- setUpJobsAndRun (tuner_->population , " Population " );
600
+ std::cout << " Generation " << generation << ' : ' << std::endl;
601
+ setUpJobsAndRun (tuner_->population , " New Candidates " );
583
602
tuner_->generateSelectionPool ();
584
603
setUpJobsAndRun (tuner_->selectionPool , " Selection Pool" );
585
604
tuner_->selectSurvivors ();
0 commit comments