@@ -313,6 +313,7 @@ void GeneticTunerHarness::doCompile(
313
313
if (current >= population.size ()) {
314
314
break ;
315
315
}
316
+
316
317
auto & pConf = population.at (current);
317
318
auto options = makeOptions (*pConf);
318
319
try {
@@ -431,7 +432,7 @@ void GeneticTunerHarness::doGpuWork(
431
432
LOG_LINE_BY_LINE (INFO, ssInfo);
432
433
}
433
434
434
- auto runtimes =
435
+ std::vector<Duration> runtimes =
435
436
retrieveCachedRuntimes (engine, kKernelName_ , inputs, outputs, options);
436
437
if (runtimes.empty ()) {
437
438
try {
@@ -525,59 +526,77 @@ void GeneticTunerHarness::runOneGeneration(size_t generation) {
525
526
526
527
auto setUpJobsAndRun = [&](GeneticSearch::Population& population,
527
528
const std::string& printerText) {
528
- // Initialize for this round
529
- currentCompilationJob_.store (0 );
530
- numEvaluations_.store (0 );
531
- readyToEvaluate_.resize (0 );
532
- for (size_t i = 0 ; i < population.size (); ++i) {
533
- readyToEvaluate_.emplace_back ();
534
- readyToEvaluate_[i].store (false );
535
- }
536
- Printer printer (
537
- printerText,
538
- readyToEvaluate_.size (),
539
- currentCompilationJob_,
540
- numEvaluations_);
541
- auto logGenerations = FLAGS_tuner_gen_log_generations;
542
- ScopeGuard sgPrinter ([logGenerations, &printer]() {
543
- printer.stop ();
544
- if (logGenerations) {
545
- printer.printAll ();
546
- }
547
- });
548
-
549
- // Just spawn and join new threads for each generation
550
- std::vector<std::thread> cpuCompilationThreads;
551
- cpuCompilationThreads.reserve (FLAGS_tuner_threads);
552
- ScopeGuard sgCompilationThreads ([&cpuCompilationThreads]() {
553
- for (auto & cpuCompilationThread : cpuCompilationThreads) {
554
- cpuCompilationThread.join ();
529
+ // Most candidates should have been evaluated during the previous
530
+ // generation's selection phase.
531
+ // There are two exceptions:
532
+ // 1) the 1st generation
533
+ // 2) too many invalid configurations were previously encountered and the
534
+ // valid ones were not enough to form a new generation.
535
+ auto firstNew = std::partition (
536
+ population.begin (),
537
+ population.end (),
538
+ [](const std::unique_ptr<CandidateConfiguration>& c) {
539
+ return c->runtime != Duration::zero ();
540
+ });
541
+ GeneticSearch::Population newCandidates (
542
+ std::distance (firstNew, population.end ()));
543
+ std::move (firstNew, population.end (), newCandidates.begin ());
544
+ {
545
+ // Initialize for this round
546
+ currentCompilationJob_.store (0 );
547
+ numEvaluations_.store (0 );
548
+ readyToEvaluate_.resize (0 );
549
+ for (size_t i = 0 ; i < newCandidates.size (); ++i) {
550
+ readyToEvaluate_.emplace_back ();
551
+ readyToEvaluate_[i].store (false );
555
552
}
556
- });
557
- for (int i = 0 ; i < FLAGS_tuner_threads; ++i) {
558
- cpuCompilationThreads.emplace_back ([this , &engine, &population]() {
559
- this ->doCompile (engine, population);
553
+ Printer printer (
554
+ printerText,
555
+ readyToEvaluate_.size (),
556
+ currentCompilationJob_,
557
+ numEvaluations_);
558
+ auto logGenerations = FLAGS_tuner_gen_log_generations;
559
+ ScopeGuard sgPrinter ([logGenerations, &printer]() {
560
+ printer.stop ();
561
+ if (logGenerations) {
562
+ printer.printAll ();
563
+ }
560
564
});
561
- }
562
565
563
- // Just spawn and join new threads for each generation
564
- std::vector<std::thread> gpuWorkerThreads;
565
- gpuWorkerThreads.reserve (gpus.size ());
566
- ScopeGuard sgGpuWorkerThreads ([&gpuWorkerThreads]() {
567
- for (auto & gpuWorkerThread : gpuWorkerThreads) {
568
- gpuWorkerThread.join ();
566
+ // Just spawn and join new threads for each generation
567
+ std::vector<std::thread> cpuCompilationThreads;
568
+ cpuCompilationThreads.reserve (FLAGS_tuner_threads);
569
+ ScopeGuard sgCompilationThreads ([&cpuCompilationThreads]() {
570
+ for (auto & cpuCompilationThread : cpuCompilationThreads) {
571
+ cpuCompilationThread.join ();
572
+ }
573
+ });
574
+ for (int i = 0 ; i < FLAGS_tuner_threads; ++i) {
575
+ cpuCompilationThreads.emplace_back ([this , &engine, &newCandidates]() {
576
+ this ->doCompile (engine, newCandidates);
577
+ });
578
+ }
579
+
580
+ // Just spawn and join new threads for each generation
581
+ std::vector<std::thread> gpuWorkerThreads;
582
+ gpuWorkerThreads.reserve (gpus.size ());
583
+ ScopeGuard sgGpuWorkerThreads ([&gpuWorkerThreads]() {
584
+ for (auto & gpuWorkerThread : gpuWorkerThreads) {
585
+ gpuWorkerThread.join ();
586
+ }
587
+ });
588
+ for (auto gpu : gpus) {
589
+ gpuWorkerThreads.emplace_back (
590
+ [this , gpu, &engine, &newCandidates, &printer]() {
591
+ this ->doGpuWork (gpu, engine, newCandidates, printer);
592
+ });
569
593
}
570
- });
571
- for (auto gpu : gpus) {
572
- gpuWorkerThreads.emplace_back (
573
- [this , gpu, &engine, &population, &printer]() {
574
- this ->doGpuWork (gpu, engine, population, printer);
575
- });
576
594
}
577
595
// At this point everything is synchronized because out of scope, done
596
+ std::move (newCandidates.begin (), newCandidates.end (), firstNew);
578
597
};
579
- std::cout << " Generation " << generation << std::endl;
580
- setUpJobsAndRun (tuner_->population , " Population " );
598
+ std::cout << " Generation " << generation << ' : ' << std::endl;
599
+ setUpJobsAndRun (tuner_->population , " New Candidates " );
581
600
tuner_->generateSelectionPool ();
582
601
setUpJobsAndRun (tuner_->selectionPool , " Selection Pool" );
583
602
tuner_->selectSurvivors ();
0 commit comments