From 4c3f91532fb92aeb2f0a6c0e42b3430b0130cab5 Mon Sep 17 00:00:00 2001 From: Theodoros Theodoridis Date: Thu, 15 Mar 2018 10:14:40 +0100 Subject: [PATCH 1/9] [genetic search] Fix bug that disabled crossover --- tc/autotuner/genetic_search.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tc/autotuner/genetic_search.cc b/tc/autotuner/genetic_search.cc index c8cc01127..ff11be1b1 100644 --- a/tc/autotuner/genetic_search.cc +++ b/tc/autotuner/genetic_search.cc @@ -250,9 +250,9 @@ void GeneticSearch::breed() { auto accFitness = computeAccumulatedFitness(population); Population new_population; new_population.reserve(kMaxPopulationSize); - for (auto& p : population) { + for (size_t c = 0; c < kNumberElites; ++c) { new_population.push_back( - make_unique(p->configuration)); + make_unique(population.at(c)->configuration)); } auto select = [&]() -> TuningConfiguration& { From b6c33eed3db16dca70a560d7833bdb2aaf905d3d Mon Sep 17 00:00:00 2001 From: Theodoros Theodoridis Date: Tue, 13 Mar 2018 11:12:37 +0100 Subject: [PATCH 2/9] [genetic search] Scale fitness before selection Using an unscaled fitness value for selection is problematic: - Outstanding individuals take over very quickly, which leads to premature convergence. - When fitness values are close together, very little selection pressure is applied and selection is almost uniformly random. Having slightly better fitness does not improve an individual's survival chances. - Transposing the fitness function (e.g. adding a constant value) changes the selection probabilities even though the location of the optimum (and the "shape" of the fitness) remain unchanged. Scaling the fitness function helps ameliorate those issues. Sigma scaling is used: fitness' = max(fitness - (mean_fitness - 2 * std_fitness), 0) --- tc/autotuner/genetic_search.cc | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/tc/autotuner/genetic_search.cc b/tc/autotuner/genetic_search.cc index ff11be1b1..61b9cb9a8 100644 --- a/tc/autotuner/genetic_search.cc +++ b/tc/autotuner/genetic_search.cc @@ -16,6 +16,8 @@ #include "tc/autotuner/genetic_search.h" +#include +#include #include #include @@ -72,9 +74,35 @@ void mutate( } } -void normalizeVector(std::vector& v) { +double mean(std::vector& v) { + if (v.empty()) { + throw std::invalid_argument("Cannot compute the mean of an empty vector."); + } auto sum = std::accumulate(v.begin(), v.end(), 0.0); + return sum / v.size(); +} + +double stdv(std::vector& v, double mean) { + std::vector diffs(v.size()); + std::transform(v.begin(), v.end(), diffs.begin(), [mean](double val) { + return val - mean; + }); + + auto squareSum = + std::inner_product(diffs.begin(), diffs.end(), diffs.begin(), 0.0); + return std::sqrt(squareSum / v.size()); } + +void sigmaScale(std::vector& v) { + auto m = mean(v); + auto s = stdv(v, m); + std::transform(v.begin(), v.end(), v.begin(), [m, s](double val) { + return std::max(val - (m - 2 * s), 0.0); + }); +} +void normalizeVector(std::vector& v) { + auto sum = std::accumulate(v.begin(), v.end(), 0.0); std::transform( v.begin(), v.end(), v.begin(), [sum](double v) { return v / sum; }); } @@ -92,6 +120,7 @@ std::vector computeNormalizedFitness( std::chrono::duration_cast(c->runtime) .count(); }); + sigmaScale(fitness); normalizeVector(fitness); return fitness; } From 5052f83d7ae327252d87638b483ef28c373aed7c Mon Sep 17 00:00:00 2001 From: Theodoros Theodoridis Date: Thu, 15 Mar 2018 10:15:34 +0100 Subject: [PATCH 3/9] [genetic search] Switch to Stochastic
Universal Sampling Stochastic Universal Sampling is an improvement upon the roulette algorithm that was previously used. --- tc/autotuner/genetic_search.cc | 34 +++++++++++++++++++++++++++++----- tc/autotuner/genetic_search.h | 3 +++ 2 files changed, 32 insertions(+), 5 deletions(-) diff --git a/tc/autotuner/genetic_search.cc b/tc/autotuner/genetic_search.cc index 61b9cb9a8..bfa583869 100644 --- a/tc/autotuner/genetic_search.cc +++ b/tc/autotuner/genetic_search.cc @@ -193,6 +193,7 @@ GeneticSearch::GeneticSearch( : population(), lastBestConf(confs[0]), kMaxPopulationSize(n), + kMatingPoolSize(n * 3), kCrossOverRate(crossOverRate), kMutationRate(mutationRate), kNumberElites(numberElites), @@ -225,6 +226,7 @@ GeneticSearch::GeneticSearch( : population(), lastBestConf(conf), kMaxPopulationSize(n), + kMatingPoolSize(n * 3), kCrossOverRate(crossOverRate), kMutationRate(mutationRate), kNumberElites(numberElites), @@ -275,19 +277,41 @@ TuningConfiguration GeneticSearch::crossover( return a; } +std::vector GeneticSearch::stochasticUniversalSampling( + const std::vector& fitness) const { + std::vector matingPool; + matingPool.reserve(kMatingPoolSize); + + auto r = + std::uniform_real_distribution(0, 1.0 / kMatingPoolSize)(rng); + size_t count = 0; + size_t i = 0; + while (count < kMatingPoolSize) { + while (r <= fitness[i]) { + matingPool.push_back(population[i]->configuration); + r += 1.0 / kMatingPoolSize; + ++count; + } + ++i; + } + return matingPool; +} + void GeneticSearch::breed() { - auto accFitness = computeAccumulatedFitness(population); + auto matingPool = + stochasticUniversalSampling(computeAccumulatedFitness(population)); + Population new_population; - new_population.reserve(kMaxPopulationSize); + new_population.reserve(kMatingPoolSize); for (size_t c = 0; c < kNumberElites; ++c) { new_population.push_back( make_unique(population.at(c)->configuration)); } auto select = [&]() -> TuningConfiguration& { - auto limit = std::uniform_real_distribution{}(rng); - auto lb = std::lower_bound(accFitness.begin(), accFitness.end(), limit); - return population.at(std::distance(accFitness.begin(), lb))->configuration; + auto idx = std::uniform_int_distribution{ + size_t(0), matingPool.size() - 1}(rng); + return matingPool.at(idx); }; auto shouldCrossOver = [&]() -> bool { /* diff --git a/tc/autotuner/genetic_search.h b/tc/autotuner/genetic_search.h index 048670211..420891f6d 100644 --- a/tc/autotuner/genetic_search.h +++ b/tc/autotuner/genetic_search.h @@ -97,6 +97,8 @@ class GeneticSearch { void updateParameters(); private: + std::vector stochasticUniversalSampling( + const std::vector& fitness) const; void breed(); TuningConfiguration crossover( @@ -113,6 +115,7 @@ class GeneticSearch { Population population; TuningConfiguration lastBestConf; const size_t kMaxPopulationSize; + const size_t kMatingPoolSize; const uint8_t kCrossOverRate; const uint8_t kMutationRate; const size_t kNumberElites; From a83b465a3c186005ffc7386dfcf2daa34d347e5f Mon Sep 17 00:00:00 2001 From: Theodoros Theodoridis Date: Fri, 16 Mar 2018 11:37:48 +0100 Subject: [PATCH 4/9] [genetic search] Switch to (mu,lambda) selection Previously each generation had mu candidates and generated mu children, which all survived. This meant that really bad candidates that were randomly generated would survive across generations. With this change, lambda (typically larger than mu) children are generated and the best mu survive. The previous behaviour is a special case in which lambda = mu.
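As a rough, self-contained C++ sketch of the (mu,lambda) scheme described above (Candidate, the random breeding step, and the runtime values are illustrative stand-ins for the tuner's real configuration type, crossover/mutation, and GPU benchmarking):

#include <algorithm>
#include <cstddef>
#include <random>
#include <vector>

struct Candidate {
  double runtime;  // lower is better, like a measured kernel runtime
};

int main() {
  std::mt19937 rng(std::random_device{}());
  std::uniform_real_distribution<double> fakeBenchmark(1.0, 10.0);
  auto breedRandomChild = [&] { return Candidate{fakeBenchmark(rng)}; };

  const std::size_t mu = 10;      // survivors kept per generation (pop_size)
  const std::size_t lambda = 30;  // children bred per generation, lambda >= mu

  std::vector<Candidate> population(mu);
  std::generate(population.begin(), population.end(), breedRandomChild);

  for (int generation = 0; generation < 5; ++generation) {
    // Breed lambda children (the real tuner uses crossover + mutation here).
    std::vector<Candidate> children(lambda);
    std::generate(children.begin(), children.end(), breedRandomChild);

    // (mu,lambda) selection: parents are discarded and only the best mu
    // children survive, so a bad randomly generated candidate cannot
    // persist across generations.
    std::partial_sort(
        children.begin(), children.begin() + mu, children.end(),
        [](const Candidate& a, const Candidate& b) {
          return a.runtime < b.runtime;
        });
    population.assign(children.begin(), children.begin() + mu);
  }
}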
--- .../pytorch_integration/autotuning_layers.rst | 7 +- .../tutorials/tutorial_tensordot_with_tc.rst | 2 +- tc/autotuner/genetic_autotuner.cc | 3 +- tc/autotuner/genetic_search.cc | 121 +++++++++++------- tc/autotuner/genetic_search.h | 17 ++- tc/autotuner/genetic_tuning_harness.cc | 59 +++++---- tc/autotuner/genetic_tuning_harness.h | 18 ++- tc/autotuner/utils/printer.cc | 7 +- tc/autotuner/utils/printer.h | 4 +- tc/core/flags.cc | 15 ++- tc/core/flags.h | 3 +- tc/examples/tensordot.cc | 3 +- .../pybinds/pybind_autotuner.cc | 12 +- tensor_comprehensions/tc_unit.py | 9 +- test/cuda/test_autotuner.cc | 4 +- test_python/common.py | 2 +- 16 files changed, 180 insertions(+), 106 deletions(-) diff --git a/docs/source/framework/pytorch_integration/autotuning_layers.rst b/docs/source/framework/pytorch_integration/autotuning_layers.rst index 4574e4f85..f00ce658b 100644 --- a/docs/source/framework/pytorch_integration/autotuning_layers.rst +++ b/docs/source/framework/pytorch_integration/autotuning_layers.rst @@ -58,7 +58,6 @@ You can read about all the parameters here - :ref:`autotuner_parameters`. - :code:`threads` - set this to number of CPU cores available. - :code:`generations` - 5 to 10 generations is a good number. - :code:`pop_size` - 10 is usually reasonable. You can try 10 to 20. -- :code:`number_elites` - number of candidates preserved intact between generations. `1` is usually sufficient. - :code:`min_launch_total_threads` - If you have really input small sizes, set this to `1`. - :code:`gpus`: Number of gpus to use for autotuning. Default value is "0". Set this to "0,1" if you wish to use two gpus (for example). @@ -70,7 +69,7 @@ kernel timing. You can adopt the following parameter settings as starters for au .. code:: settings = { - "threads": 32, "generations": 2, "pop_size": 10, "number_elites": 1 + "threads": 32, "generations": 2, "pop_size": 10 } * The good defaults that run for a bit longer (in exchange for better performance): @@ -78,7 +77,7 @@ kernel timing. You can adopt the following parameter settings as starters for au .. code:: settings = { - "threads": 32, "generations": 5, "pop_size": 10, "number_elites": 1 + "threads": 32, "generations": 5, "pop_size": 10 } @@ -87,7 +86,7 @@ kernel timing. You can adopt the following parameter settings as starters for au .. code:: settings = { - "threads": 32, "generations": 25, "pop_size": 100, "number_elites": 10 + "threads": 32, "generations": 25, "pop_size": 100 } diff --git a/docs/source/tutorials/tutorial_tensordot_with_tc.rst b/docs/source/tutorials/tutorial_tensordot_with_tc.rst index 6a98695df..52cc3deac 100644 --- a/docs/source/tutorials/tutorial_tensordot_with_tc.rst +++ b/docs/source/tutorials/tutorial_tensordot_with_tc.rst @@ -132,7 +132,7 @@ later. You can control the amount of autotuning by changing the autotuner parameters. See :ref:`autotune_parameters` for how to change the settings. -For the setting ``settings={"generations": 25, "pop_size": 100, "number_elites": 10}``, we +For the setting ``settings={"generations": 25, "pop_size": 100}``, we get a decent kernel performance as shown in the screenshot below (tuned on one M40 GPU): .. 
figure:: ../_static/img/autotuning-py.jpg diff --git a/tc/autotuner/genetic_autotuner.cc b/tc/autotuner/genetic_autotuner.cc index a4493eef2..bbf53426b 100644 --- a/tc/autotuner/genetic_autotuner.cc +++ b/tc/autotuner/genetic_autotuner.cc @@ -120,7 +120,8 @@ llvm::Optional GeneticAutotuner::tune( FLAGS_tuner_gen_pop_size, FLAGS_tuner_gen_crossover_rate, FLAGS_tuner_gen_mutation_rate, - FLAGS_tuner_gen_number_elites, + FLAGS_tuner_gen_mating_pool_size, + FLAGS_tuner_gen_selection_pool_size, tcNameMap_.at(tcName), tcName, inputs, diff --git a/tc/autotuner/genetic_search.cc b/tc/autotuner/genetic_search.cc index bfa583869..709c2eb10 100644 --- a/tc/autotuner/genetic_search.cc +++ b/tc/autotuner/genetic_search.cc @@ -162,7 +162,8 @@ void dropInvalidConfigurations(GeneticSearch::Population& population) { } // namespace #define VALIDATE() \ - CHECK_LT(kNumberElites, kMaxPopulationSize); \ + CHECK_LT(kMaxPopulationSize, kMatingPoolSize); \ + CHECK_LT(kMaxPopulationSize, kSelectionPoolSize); \ CHECK(kMutationRate >= 0 and kMutationRate <= 100) \ << "the mutation rate (" << kMutationRate \ << ") should be in the [0,100] interval"; \ @@ -189,14 +190,15 @@ GeneticSearch::GeneticSearch( size_t n, uint8_t crossOverRate, uint8_t mutationRate, - size_t numberElites) + size_t matingPoolSize, + size_t selectionPoolSize) : population(), lastBestConf(confs[0]), kMaxPopulationSize(n), - kMatingPoolSize(n * 3), + kMatingPoolSize(matingPoolSize), + kSelectionPoolSize(selectionPoolSize), kCrossOverRate(crossOverRate), kMutationRate(mutationRate), - kNumberElites(numberElites), rng{std::random_device{}()} { restoreRngState(rng); VALIDATE(); @@ -222,14 +224,15 @@ GeneticSearch::GeneticSearch( size_t n, uint8_t crossOverRate, uint8_t mutationRate, - size_t numberElites) + size_t matingPoolSize, + size_t selectionPoolSize) : population(), lastBestConf(conf), kMaxPopulationSize(n), - kMatingPoolSize(n * 3), + kMatingPoolSize(matingPoolSize), + kSelectionPoolSize(selectionPoolSize), kCrossOverRate(crossOverRate), kMutationRate(mutationRate), - kNumberElites(numberElites), rng{std::random_device{}()} { restoreRngState(rng); VALIDATE(); @@ -301,13 +304,6 @@ void GeneticSearch::breed() { auto matingPool = stochasticUniversalSampling(computeAccumulatedFitness(population)); - Population new_population; - new_population.reserve(kMatingPoolSize); - for (size_t c = 0; c < kNumberElites; ++c) { - new_population.push_back( - make_unique(population.at(c)->configuration)); - } - auto select = [&]() -> TuningConfiguration& { auto idx = std::uniform_int_distribution{ size_t(0), matingPool.size() - 1}(rng); @@ -323,45 +319,20 @@ void GeneticSearch::breed() { return dist(rng); }; - while (new_population.size() < kMaxPopulationSize) { + while (selectionPool.size() < kSelectionPoolSize) { if (shouldCrossOver()) { auto parent1 = select(); auto parent2 = select(); auto parent3 = select(); - new_population.emplace_back(make_unique( + selectionPool.emplace_back(make_unique( crossover(parent1, parent2, parent3))); } else { - new_population.emplace_back( - make_unique(select())); + selectionPool.emplace_back(make_unique(select())); } } - population = std::move(new_population); } -void GeneticSearch::updateParameters() { - dropInvalidConfigurations(population); - - // Sort population before taking any decision - std::sort( - population.begin(), - population.end(), - [](const std::unique_ptr& a, - const std::unique_ptr& b) { - checkRuntimeRecorded(a->runtime); - checkRuntimeRecorded(b->runtime); - return a->runtime < b->runtime; - }); - - // 
Update failsafe lastBestConf - lastBestConf = - population.size() > 0 ? population.front()->configuration : lastBestConf; - if (FLAGS_tuner_print_best) { - CudaMappingOptions options( - CudaMappingOptions::makeSingleThreadCudaMappingOptions()); - lastBestConf.applyToCudaMappingOptions(options); - LOG(INFO) << "Best so far:\n" << options; - } - +void GeneticSearch::resetPopulationIfNotEnoughCandidates() { if (population.size() < kMinCandidatesForBreeding) { LOG_IF(ERROR, FLAGS_debug_tuner) << population.size() << " out of " << kMaxPopulationSize @@ -380,12 +351,70 @@ void GeneticSearch::updateParameters() { // Don't lose the first one which was the best from before CHECK_LT(0, population.size()); randomizePopulation(population.begin() + 1, population.end(), rng); - return; } +} +namespace { +void sortByRuntime(GeneticSearch::Population& population) { + std::sort( + population.begin(), + population.end(), + [](const std::unique_ptr& a, + const std::unique_ptr& b) { + checkRuntimeRecorded(a->runtime); + checkRuntimeRecorded(b->runtime); + return a->runtime < b->runtime; + }); +} +} // namespace + +void GeneticSearch::updateBestCandidate(const TuningConfiguration& c) { + lastBestConf = c; + if (FLAGS_tuner_print_best) { + CudaMappingOptions options( + CudaMappingOptions::makeSingleThreadCudaMappingOptions()); + lastBestConf.applyToCudaMappingOptions(options); + LOG(INFO) << "Best so far:\n" << options; + } +} + +void GeneticSearch::generateSelectionPool() { + dropInvalidConfigurations(population); + sortByRuntime(population); + updateBestCandidate( + population.size() > 0 ? population.front()->configuration : lastBestConf); + resetPopulationIfNotEnoughCandidates(); breed(); - for (int i = kNumberElites; i < population.size(); ++i) { - mutate(*population[i], kMutationRate, kMutateIterations, rng); + selectionPool.clear(); + selectionPool.emplace_back(make_unique(lastBestConf)); + breed(); + for (size_t i = 1; i < selectionPool.size(); ++i) { + mutate(*selectionPool[i], kMutationRate, kMutateIterations, rng); + } +} + +void GeneticSearch::selectSurvivors() { + dropInvalidConfigurations(selectionPool); + sortByRuntime(selectionPool); + population.clear(); + std::transform( + selectionPool.begin(), + selectionPool.begin() + + std::min(selectionPool.size(), kMaxPopulationSize), + std::back_inserter(population), + [](const std::unique_ptr& c) { + return make_unique(c->configuration); + }); + + if (selectionPool.size() < kMaxPopulationSize) { + auto numberMissing = kMaxPopulationSize - selectionPool.size(); + + for (size_t i = 0; i < numberMissing; ++i) { + selectionPool.emplace_back( + make_unique(lastBestConf)); + } + randomizePopulation( + selectionPool.end() - numberMissing, selectionPool.end(), rng); } } diff --git a/tc/autotuner/genetic_search.h b/tc/autotuner/genetic_search.h index 420891f6d..5683173e7 100644 --- a/tc/autotuner/genetic_search.h +++ b/tc/autotuner/genetic_search.h @@ -70,7 +70,8 @@ class GeneticSearch { size_t n, uint8_t crossOverRate, uint8_t mutationRate, - size_t numberElites); + size_t matingPoolSize, + size_t selectionPoolSize); /** * confs are used to seed the first generation, the rest of the population is @@ -92,15 +93,22 @@ class GeneticSearch { size_t n, uint8_t crossOverRate, uint8_t mutationRate, - size_t numberElites); + size_t matingPoolSize, + size_t selectionPoolSize); - void updateParameters(); + void generateSelectionPool(); + void selectSurvivors(); private: std::vector stochasticUniversalSampling( const std::vector& fitness) const; + void breed(); + void 
updateBestCandidate(const TuningConfiguration& c); + + void resetPopulationIfNotEnoughCandidates(); + TuningConfiguration crossover( TuningConfiguration&, TuningConfiguration&, @@ -113,12 +121,13 @@ class GeneticSearch { using Population = std::vector>; Population population; + Population selectionPool; TuningConfiguration lastBestConf; const size_t kMaxPopulationSize; const size_t kMatingPoolSize; + const size_t kSelectionPoolSize; const uint8_t kCrossOverRate; const uint8_t kMutationRate; - const size_t kNumberElites; /* * c++11 seeding is (apparently) not of the highest quality: diff --git a/tc/autotuner/genetic_tuning_harness.cc b/tc/autotuner/genetic_tuning_harness.cc index 927adf627..a94dde69b 100644 --- a/tc/autotuner/genetic_tuning_harness.cc +++ b/tc/autotuner/genetic_tuning_harness.cc @@ -44,7 +44,8 @@ GeneticTunerHarness::GeneticTunerHarness( size_t n, uint8_t crossoverRate, uint8_t mutationRate, - size_t numberElites, + size_t matingPoolSize, + size_t selectionPoolSize, lang::TreeRef tc, std::string kernelName, const std::unordered_map>& inputs, @@ -55,7 +56,8 @@ GeneticTunerHarness::GeneticTunerHarness( : kMaxPopulationSize(n), kCrossOverRate(crossoverRate), kMutationRate(mutationRate), - kNumberElites(numberElites), + kMatingPoolSize(matingPoolSize), + kSelectionPoolSize(selectionPoolSize), bestCudaMappingOptions_(baseMapping), kTc_(std::move(tc)), kKernelName_(std::move(kernelName)), @@ -85,14 +87,16 @@ GeneticTunerHarness::GeneticTunerHarness( kMaxPopulationSize, kCrossOverRate, kMutationRate, - kNumberElites); + kMatingPoolSize, + kSelectionPoolSize); } else { tuner_ = make_unique( configuration, kMaxPopulationSize, kCrossOverRate, kMutationRate, - kNumberElites); + kMatingPoolSize, + kSelectionPoolSize); } } @@ -300,15 +304,17 @@ bool GeneticTunerHarness::warmupOrPrune( return false; } -template -void GeneticTunerHarness::doCompile(ExecutorType& engine) { +template +void GeneticTunerHarness::doCompile( + ExecutorType& engine, + Population& population) { // Atomically fetch and add the next job until there are no jobs left while (true) { auto current = currentCompilationJob_.fetch_add(1); - if (current >= tuner_->population.size()) { + if (current >= population.size()) { break; } - auto& pConf = tuner_->population.at(current); + auto& pConf = population.at(current); auto options = makeOptions(*pConf); try { if (FLAGS_debug_tuner) { @@ -340,10 +346,11 @@ void GeneticTunerHarness::doCompile(ExecutorType& engine) { } } -template +template void GeneticTunerHarness::doGpuWork( size_t gpu, ExecutorType& engine, + Population& population, Printer& printer) { WithDevice wd(gpu); CHECK_EQ(1, kInputs_.count(gpu)); @@ -367,7 +374,7 @@ void GeneticTunerHarness::doGpuWork( // Found work to do, increment number of evaluations performed numEvaluations_.fetch_add(1); } else { - if (numEvaluations_.load() >= tuner_->population.size()) { + if (numEvaluations_.load() >= population.size()) { // No more work can arrive, exit return; } @@ -376,7 +383,7 @@ void GeneticTunerHarness::doGpuWork( continue; } - auto& pConf = tuner_->population.at(current); + auto& pConf = population.at(current); if (pConf->invalid) { continue; } @@ -479,17 +486,18 @@ void GeneticTunerHarness::runOneGeneration(size_t generation) { tc::ExecutionEngine engine; engine.define({kTc_}); - { + auto setUpJobsAndRun = [&](GeneticSearch::Population& population, + const std::string& printerText) { // Initialize for this round currentCompilationJob_.store(0); numEvaluations_.store(0); readyToEvaluate_.resize(0); - for (int i = 0; 
i < kMaxPopulationSize; ++i) { + for (size_t i = 0; i < population.size(); ++i) { readyToEvaluate_.emplace_back(); readyToEvaluate_[i].store(false); } Printer printer( - generation, + printerText, readyToEvaluate_.size(), currentCompilationJob_, numEvaluations_); @@ -510,8 +518,9 @@ void GeneticTunerHarness::runOneGeneration(size_t generation) { } }); for (int i = 0; i < FLAGS_tuner_threads; ++i) { - cpuCompilationThreads.emplace_back( - [this, &engine]() { this->doCompile(engine); }); + cpuCompilationThreads.emplace_back([this, &engine, &population]() { + this->doCompile(engine, population); + }); } // Just spawn and join new threads for each generation @@ -523,13 +532,18 @@ void GeneticTunerHarness::runOneGeneration(size_t generation) { } }); for (auto gpu : gpus) { - gpuWorkerThreads.emplace_back([this, gpu, &engine, &printer]() { - this->doGpuWork(gpu, engine, printer); - }); + gpuWorkerThreads.emplace_back( + [this, gpu, &engine, &population, &printer]() { + this->doGpuWork(gpu, engine, population, printer); + }); } - } - - // At this point everything is synchronized because out of scope, done + // At this point everything is synchronized because out of scope, done + }; + std::cout << "Generation " << generation << std::endl; + setUpJobsAndRun(tuner_->population, "Population"); + tuner_->generateSelectionPool(); + setUpJobsAndRun(tuner_->selectionPool, "Selection Pool"); + tuner_->selectSurvivors(); if (FLAGS_debug_tuner) { LOG(INFO) << "[TUNER][GENERATION LOG] best option so far:"; @@ -538,7 +552,6 @@ void GeneticTunerHarness::runOneGeneration(size_t generation) { infoPrinter << bestMappingOption(); LOG_LINE_BY_LINE(INFO, ssInfo); } - tuner_->updateParameters(); } } // namespace detail diff --git a/tc/autotuner/genetic_tuning_harness.h b/tc/autotuner/genetic_tuning_harness.h index 0c7b476c0..f3e5b3495 100644 --- a/tc/autotuner/genetic_tuning_harness.h +++ b/tc/autotuner/genetic_tuning_harness.h @@ -38,7 +38,8 @@ class GeneticTunerHarness { size_t n, uint8_t crossoverRate, uint8_t mutationRate, - size_t numberElites, + size_t matingPoolSize, + size_t selectionPoolSize, lang::TreeRef tc, std::string kernelName, const std::unordered_map>& inputs, @@ -66,12 +67,16 @@ class GeneticTunerHarness { size_t bestTimeSoFar); /// Helper function to delegate compiling on the cpu to different threads - template - void doCompile(ExecutorType& engine); + template + void doCompile(ExecutorType& engine, Population& population); /// Helper function to delegate running on the gpu to different threads - template - void doGpuWork(size_t gpu, ExecutorType& engine, Printer& printer); + template + void doGpuWork( + size_t gpu, + ExecutorType& engine, + Population& population, + Printer& printer); /// Make options from conf tc::CudaMappingOptions makeOptions(const CandidateConfiguration& conf); @@ -90,7 +95,8 @@ class GeneticTunerHarness { const size_t kMaxPopulationSize; const uint8_t kCrossOverRate; const uint8_t kMutationRate; - const size_t kNumberElites; + const size_t kMatingPoolSize; + const size_t kSelectionPoolSize; TuningConfiguration configuration; diff --git a/tc/autotuner/utils/printer.cc b/tc/autotuner/utils/printer.cc index d6a39c545..3b9d218fe 100644 --- a/tc/autotuner/utils/printer.cc +++ b/tc/autotuner/utils/printer.cc @@ -41,8 +41,7 @@ void Printer::printLoop() { std::this_thread::sleep_for(std::chrono::seconds(1)); std::stringstream ss; - ss << "Generation " << generation_; - ss << "\tJobs(Compiled, GPU)/total (" + ss << prefix_ << "\tJobs(Compiled, GPU)/total (" << std::min(total_, 
currentCompilationJob_.load()) << ", " << std::min(total_, numEvaluations_.load()) << ")/" << total_; @@ -76,11 +75,11 @@ void Printer::printLoop() { } Printer::Printer( - size_t generation, + std::string prefix, size_t total, const std::atomic_size_t& currentCompilationJob, const std::atomic_size_t& numEvaluations) - : generation_(generation), + : prefix_(std::move(prefix)), printerThread_([this]() { printLoop(); }), total_(total), currentCompilationJob_(currentCompilationJob), diff --git a/tc/autotuner/utils/printer.h b/tc/autotuner/utils/printer.h index 4a2760564..b2ddff853 100644 --- a/tc/autotuner/utils/printer.h +++ b/tc/autotuner/utils/printer.h @@ -33,7 +33,7 @@ namespace autotune { class Printer { public: Printer( - size_t generation, + std::string prefix, size_t total, const std::atomic_size_t& currentCompilationJob, const std::atomic_size_t& numEvaluations); @@ -47,7 +47,7 @@ class Printer { private: void printLoop(); - size_t generation_; + std::string prefix_; std::vector runtimes_; mutable std::mutex runtimesMtx_; diff --git a/tc/core/flags.cc b/tc/core/flags.cc index e2f4f5a25..2e328da8a 100644 --- a/tc/core/flags.cc +++ b/tc/core/flags.cc @@ -59,6 +59,17 @@ DEFINE_uint32( tuner_gen_pop_size, 100, "Population size for genetic autotuning"); + +DEFINE_uint32( + tuner_gen_mating_pool_size, + 300, + "Mating pool size for genetic autotuning"); + +DEFINE_uint32( + tuner_gen_selection_pool_size, + 300, + "Selection pool size for genetic autotuning"); + DEFINE_uint32( tuner_gen_crossover_rate, 80, @@ -71,10 +82,6 @@ DEFINE_uint32( tuner_gen_generations, 25, "How many generations to run genetic tuning for"); -DEFINE_uint32( - tuner_gen_number_elites, - 10, - "The number of best candidates that are preserved intact between generations"); DEFINE_uint32(tuner_threads, 1, "Number of CPU threads to use when autotuning"); DEFINE_string( tuner_gpus, diff --git a/tc/core/flags.h b/tc/core/flags.h index 0324b00ad..c430dcb05 100644 --- a/tc/core/flags.h +++ b/tc/core/flags.h @@ -40,10 +40,11 @@ DECLARE_uint32(benchmark_iterations); // Used in autotuning DECLARE_uint32(tuner_gen_pop_size); +DECLARE_uint32(tuner_gen_mating_pool_size); +DECLARE_uint32(tuner_gen_selection_pool_size); DECLARE_uint32(tuner_gen_crossover_rate); DECLARE_uint32(tuner_gen_mutation_rate); DECLARE_uint32(tuner_gen_generations); -DECLARE_uint32(tuner_gen_number_elites); DECLARE_uint32(tuner_threads); DECLARE_string(tuner_gpus); DECLARE_bool(tuner_print_best); diff --git a/tc/examples/tensordot.cc b/tc/examples/tensordot.cc index a87e1606b..0e463e775 100644 --- a/tc/examples/tensordot.cc +++ b/tc/examples/tensordot.cc @@ -89,7 +89,8 @@ def tensordot(float(N, C1, C2, H, W) I0, // From root, run with: // ./build/examples/tensordot --tuner_threads=10 --tuner_gen_pop_size=10 -// --tuner_gen_generations=3 --tuner_gen_number_elites=4 +// --tuner_gen_generations=3 --tuner_gen_mating_pool_size=20 +// --tuner_gen_selection_pool_size=20 // --proto_path="/tmp/tensordot" int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); diff --git a/tensor_comprehensions/pybinds/pybind_autotuner.cc b/tensor_comprehensions/pybinds/pybind_autotuner.cc index 4a5337ca2..3a70ae8a6 100644 --- a/tensor_comprehensions/pybinds/pybind_autotuner.cc +++ b/tensor_comprehensions/pybinds/pybind_autotuner.cc @@ -70,10 +70,16 @@ PYBIND11_MODULE(autotuner, m) { tc::FLAGS_tuner_gen_generations = generations; }) .def( - "number_elites", + "mating_pool_size", [](tc::autotune::GeneticAutotunerATen& instance, - uint32_t& number_elites) { - 
tc::FLAGS_tuner_gen_number_elites = number_elites; + uint32_t& mating_pool_size) { + tc::FLAGS_tuner_gen_mating_pool_size = mating_pool_size; + }) + .def( + "selection_pool_size", + [](tc::autotune::GeneticAutotunerATen& instance, + uint32_t& selection_pool_size) { + tc::FLAGS_tuner_gen_selection_pool_size = selection_pool_size; }) .def( "threads", diff --git a/tensor_comprehensions/tc_unit.py b/tensor_comprehensions/tc_unit.py index f512fad30..8185a00cb 100644 --- a/tensor_comprehensions/tc_unit.py +++ b/tensor_comprehensions/tc_unit.py @@ -193,6 +193,9 @@ def __init__(self, tc_lang, **kwargs): def set_autotuner_parameters( self, pop_size=20, crossover_rate=80, mutation_rate=7, generations=10, number_elites=1, threads=8, gpus="0", restore_from_proto=False, + restore_number=10, log_generations=False + mating_pool_size=60, selection_pool_size=60, + threads=8, gpus="0", restore_from_proto=False, restore_number=10, log_generations=False, save_best_candidates_count=10, tuner_min_launch_total_threads=64, **kwargs ): @@ -200,7 +203,8 @@ def set_autotuner_parameters( self.autotuner.crossover_rate(crossover_rate) self.autotuner.mutation_rate(mutation_rate) self.autotuner.generations(generations) - self.autotuner.number_elites(number_elites) + self.autotuner.mating_pool_size(mating_pool_size) + self.autotuner.selection_pool_size(selection_pool_size) self.autotuner.threads(threads) self.autotuner.gpus(gpus) self.autotuner.restore_from_proto(restore_from_proto) @@ -575,9 +579,6 @@ def autotune(self, *inputs, **kwargs): mutation_rate (int): rate at which candidate options are randomly changed (mutated). Default 7 - number_elites (int): - number of best candidates that are preserved intact between generations (without any mutations). Default 10 - threads (int): The number of threads that are used to compile different candidates in parallel. Default 1 diff --git a/test/cuda/test_autotuner.cc b/test/cuda/test_autotuner.cc index 7de32a0c3..8871d7037 100644 --- a/test/cuda/test_autotuner.cc +++ b/test/cuda/test_autotuner.cc @@ -51,7 +51,9 @@ struct ATenCompilationUnitTest : public ::testing::Test { tc::FLAGS_tuner_gen_pop_size = 8; tc::FLAGS_tuner_gen_generations = 5; tc::FLAGS_tuner_threads = std::min(8u, tc::FLAGS_tuner_gen_pop_size); - tc::FLAGS_tuner_gen_number_elites = tc::FLAGS_tuner_gen_pop_size / 4; + tc::FLAGS_tuner_gen_mating_pool_size = tc::FLAGS_tuner_gen_pop_size * 3; + tc::FLAGS_tuner_gen_selection_pool_size = + tc::FLAGS_tuner_gen_pop_size * 3; } } diff --git a/test_python/common.py b/test_python/common.py index 0ccd9605f..18f887892 100644 --- a/test_python/common.py +++ b/test_python/common.py @@ -36,7 +36,7 @@ def assert_almost_equal(self, diff, inputs, operations, precision=1e-7): def autotune_store(self, cache_file, lang, tc_name, inputs, tc_type): tuner = TcAutotuner( - lang, threads=16, pop_size=10, number_elites=1, generations=1 + lang, threads=16, pop_size=10, generations=1 ) best_options = tuner.tune_and_store( tc_name, inputs, mapping_options=tc_type, cache_file=cache_file From fefa81863a41bd918b5c7e5a4f82bec38dfea185 Mon Sep 17 00:00:00 2001 From: Theodoros Theodoridis Date: Fri, 16 Mar 2018 13:56:01 +0100 Subject: [PATCH 5/9] [genetic tuning] Use cached runtimes if available Some candidates survive across generations. Their runtimes are stored in the options cache. Previously those candidates would be benchmarked every time they were encountered. Now the runtimes are restored from the caches instead.
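The caching idea above can be sketched independently of the real OptionsCache API; the map, the string key, and benchmarkOnGpu() below are hypothetical simplifications rather than the tuner's actual interfaces:

#include <string>
#include <unordered_map>
#include <vector>

using Duration = double;  // stand-in for the tuner's Duration type

struct Options {
  std::string key;  // canonical encoding of a candidate configuration
};

// Placeholder for the expensive compile-and-benchmark step.
Duration benchmarkOnGpu(const Options& opts) {
  return static_cast<Duration>(opts.key.size());
}

std::unordered_map<std::string, std::vector<Duration>> runtimeCache;

// Reuse recorded runtimes for configurations measured in an earlier
// generation; benchmark only on a cache miss.
std::vector<Duration> runtimesFor(const Options& opts) {
  auto it = runtimeCache.find(opts.key);
  if (it != runtimeCache.end()) {
    return it->second;
  }
  std::vector<Duration> measured{benchmarkOnGpu(opts)};
  runtimeCache.emplace(opts.key, measured);
  return measured;
}

int main() {
  Options o{"tile:32,unroll:4"};
  runtimesFor(o);  // measured and recorded
  runtimesFor(o);  // served from the cache
}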
--- tc/autotuner/genetic_tuning_harness.cc | 122 ++++++++++++++++--------- 1 file changed, 80 insertions(+), 42 deletions(-) diff --git a/tc/autotuner/genetic_tuning_harness.cc b/tc/autotuner/genetic_tuning_harness.cc index a94dde69b..024303210 100644 --- a/tc/autotuner/genetic_tuning_harness.cc +++ b/tc/autotuner/genetic_tuning_harness.cc @@ -346,6 +346,39 @@ void GeneticTunerHarness::doCompile( } } +namespace { +std::vector toConstDlpackTensors( + const std::vector& v) { + std::vector out(v.begin(), v.end()); + return out; +} +} // namespace + +template +std::vector retrieveCachedRuntimes( + ExecutorType& engine, + const std::string& id, + const std::vector& inputs, + const std::vector& outputs, + const CudaMappingOptions& options) { + if (not OptionsCache::cacheEnabled()) { + return {}; + } + auto cache = OptionsCache::getCache(); + auto allResults = cache->retrieveOptionsAndRuntimes( + id, inputs, toConstDlpackTensors(outputs)); + auto wantedResult = std::find_if( + allResults.begin(), + allResults.end(), + [&options](const OptionsCache::RetrievalResult& r) { + return r.options == options; + }); + if (wantedResult == allResults.end()) { + return {}; + } + return wantedResult->recordedRuntimes; +} + template void GeneticTunerHarness::doGpuWork( size_t gpu, @@ -399,51 +432,56 @@ void GeneticTunerHarness::doGpuWork( LOG_LINE_BY_LINE(INFO, ssInfo); } - std::vector runtimes; - try { - size_t bestTimeSoFar; - { - std::lock_guard lock(bestTimeMtx_); - bestTimeSoFar = bestTime_; - } - auto prune = - warmupOrPrune(engine, outputs, inputs, handle, bestTimeSoFar); - if (prune) { + auto runtimes = + retrieveCachedRuntimes(engine, kKernelName_, inputs, outputs, options); + if (runtimes.empty()) { + try { + size_t bestTimeSoFar; + { + std::lock_guard lock(bestTimeMtx_); + bestTimeSoFar = bestTime_; + } + auto prune = + warmupOrPrune(engine, outputs, inputs, handle, bestTimeSoFar); + if (prune) { + pConf->invalid = true; + continue; + } else { + runtimes.reserve(kReducedBenchmarkIterations); + for (size_t i = 0; i < kReducedBenchmarkIterations; ++i) { + runtimes.push_back(engine.run(handle, inputs, outputs, true)); + } + engine.clear(handle); + } + } catch (std::exception& e) { + if (FLAGS_debug_tuner) { + LOG(WARNING) << "Runtime error gpu " << gpu << ": " << e.what(); + std::stringstream ssWarning; + CudaMappingOptionsCppPrinter warningPrinter(ssWarning); + warningPrinter << options; + LOG(WARNING) << "Aborted execution on gpu " << gpu; + LOG_LINE_BY_LINE(WARNING, ssWarning); + } + while (cudaGetLastError() != cudaSuccess) { + // In case of errors in the generated, we cannot rely on deviceReset + // to set the GPU in a clean state. So instead we just pop and discard + // all the errors accumulated on the GPU until we get to a clean slate + // (i.e. cudaSuccess). + ; + } + try { + // Some errors, such as illegal memory access, cannot be recovered + // from without a cudaDeviceReset (i.e. because user protection) In + // those cases we have no choice than to fail hard. 
+ TC_CUDA_RUNTIMEAPI_ENFORCE(cudaDeviceSynchronize()); + } catch (const std::exception& e) { + LOG(FATAL) << "[CUDA][FATAL] cuda error on gpu " << gpu << ": " + << e.what() << "\n" + << CudaMappingOptionsAsCpp(options); + } pConf->invalid = true; continue; - } else { - runtimes.reserve(kReducedBenchmarkIterations); - for (size_t i = 0; i < kReducedBenchmarkIterations; ++i) { - runtimes.push_back(engine.run(handle, inputs, outputs, true)); - } - engine.clear(handle); } - } catch (std::exception& e) { - LOG(WARNING) << "Runtime error gpu " << gpu << ": " << e.what(); - std::stringstream ssWarning; - CudaMappingOptionsCppPrinter warningPrinter(ssWarning); - warningPrinter << options; - LOG(WARNING) << "Aborted execution on gpu " << gpu; - LOG_LINE_BY_LINE(WARNING, ssWarning); - while (cudaGetLastError() != cudaSuccess) { - // In case of errors in the generated, we cannot rely on deviceReset to - // set the GPU in a clean state. So instead we just pop and discard all - // the errors accumulated on the GPU until we get to a clean slate - // (i.e. cudaSuccess). - ; - } - try { - // Some errors, such as illegal memory access, cannot be recovered from - // without a cudaDeviceReset (i.e. because user protection) - // In those cases we have no choice than to fail hard. - TC_CUDA_RUNTIMEAPI_ENFORCE(cudaDeviceSynchronize()); - } catch (const std::exception& e) { - LOG(FATAL) << "[CUDA][FATAL] cuda error on gpu " << gpu << ": " - << e.what() << "\n" - << CudaMappingOptionsAsCpp(options); - } - pConf->invalid = true; - continue; } auto prof = median(runtimes); From ea3453cf0a64548b9c66732458c2d7bc753dd791 Mon Sep 17 00:00:00 2001 From: Theodoros Theodoridis Date: Fri, 16 Mar 2018 15:30:42 +0100 Subject: [PATCH 6/9] [genetic tuning] Create jobs only for new candidates Candidates that survive across generations need not be benchmarked again and thus no compilation and gpu jobs have to be created for them. 
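A minimal sketch of the idea, assuming (as the diff below does with Duration::zero()) that a zero runtime marks a candidate that has not been benchmarked yet; Candidate and the job placeholder are illustrative, not the tuner's real types:

#include <algorithm>
#include <vector>

struct Candidate {
  double runtime;  // 0 means "not benchmarked yet"
};

int main() {
  std::vector<Candidate> population(4);
  population[0].runtime = 1.5;  // survivors with recorded runtimes
  population[2].runtime = 2.0;

  // After the call, [begin, firstNew) holds already-evaluated candidates and
  // [firstNew, end) holds the new ones that still need compilation and
  // benchmarking, so jobs are created only for the latter.
  auto firstNew = std::partition(
      population.begin(), population.end(),
      [](const Candidate& c) { return c.runtime != 0.0; });

  for (auto it = firstNew; it != population.end(); ++it) {
    it->runtime = 1.0;  // stand-in for the real compile + GPU benchmark jobs
  }
}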
--- tc/autotuner/genetic_search.cc | 3 +- tc/autotuner/genetic_tuning_harness.cc | 113 +++++++++++++++---------- 2 files changed, 68 insertions(+), 48 deletions(-) diff --git a/tc/autotuner/genetic_search.cc b/tc/autotuner/genetic_search.cc index 709c2eb10..8bd3df169 100644 --- a/tc/autotuner/genetic_search.cc +++ b/tc/autotuner/genetic_search.cc @@ -403,7 +403,8 @@ void GeneticSearch::selectSurvivors() { std::min(selectionPool.size(), kMaxPopulationSize), std::back_inserter(population), [](const std::unique_ptr& c) { - return make_unique(c->configuration); + CHECK(c); + return make_unique(*c); }); if (selectionPool.size() < kMaxPopulationSize) { diff --git a/tc/autotuner/genetic_tuning_harness.cc b/tc/autotuner/genetic_tuning_harness.cc index 024303210..b7dd356e2 100644 --- a/tc/autotuner/genetic_tuning_harness.cc +++ b/tc/autotuner/genetic_tuning_harness.cc @@ -314,6 +314,7 @@ void GeneticTunerHarness::doCompile( if (current >= population.size()) { break; } + auto& pConf = population.at(current); auto options = makeOptions(*pConf); try { @@ -432,7 +433,7 @@ void GeneticTunerHarness::doGpuWork( LOG_LINE_BY_LINE(INFO, ssInfo); } - auto runtimes = + std::vector runtimes = retrieveCachedRuntimes(engine, kKernelName_, inputs, outputs, options); if (runtimes.empty()) { try { @@ -526,59 +527,77 @@ void GeneticTunerHarness::runOneGeneration(size_t generation) { auto setUpJobsAndRun = [&](GeneticSearch::Population& population, const std::string& printerText) { - // Initialize for this round - currentCompilationJob_.store(0); - numEvaluations_.store(0); - readyToEvaluate_.resize(0); - for (size_t i = 0; i < population.size(); ++i) { - readyToEvaluate_.emplace_back(); - readyToEvaluate_[i].store(false); - } - Printer printer( - printerText, - readyToEvaluate_.size(), - currentCompilationJob_, - numEvaluations_); - auto logGenerations = FLAGS_tuner_gen_log_generations; - ScopeGuard sgPrinter([logGenerations, &printer]() { - printer.stop(); - if (logGenerations) { - printer.printAll(); - } - }); - - // Just spawn and join new threads for each generation - std::vector cpuCompilationThreads; - cpuCompilationThreads.reserve(FLAGS_tuner_threads); - ScopeGuard sgCompilationThreads([&cpuCompilationThreads]() { - for (auto& cpuCompilationThread : cpuCompilationThreads) { - cpuCompilationThread.join(); + // Most candidates should have been evaluated during the previous + // generation's selection phase. + // There are two exceptions: + // 1) the 1st generation + // 2) too many invalid configurations were previously encounted and the + // valid ones were not enough to form a new generation. 
+ auto firstNew = std::partition( + population.begin(), + population.end(), + [](const std::unique_ptr& c) { + return c->runtime != Duration::zero(); + }); + GeneticSearch::Population newCandidates( + std::distance(firstNew, population.end())); + std::move(firstNew, population.end(), newCandidates.begin()); + { + // Initialize for this round + currentCompilationJob_.store(0); + numEvaluations_.store(0); + readyToEvaluate_.resize(0); + for (size_t i = 0; i < newCandidates.size(); ++i) { + readyToEvaluate_.emplace_back(); + readyToEvaluate_[i].store(false); } - }); - for (int i = 0; i < FLAGS_tuner_threads; ++i) { - cpuCompilationThreads.emplace_back([this, &engine, &population]() { - this->doCompile(engine, population); + Printer printer( + printerText, + readyToEvaluate_.size(), + currentCompilationJob_, + numEvaluations_); + auto logGenerations = FLAGS_tuner_gen_log_generations; + ScopeGuard sgPrinter([logGenerations, &printer]() { + printer.stop(); + if (logGenerations) { + printer.printAll(); + } }); - } - // Just spawn and join new threads for each generation - std::vector gpuWorkerThreads; - gpuWorkerThreads.reserve(gpus.size()); - ScopeGuard sgGpuWorkerThreads([&gpuWorkerThreads]() { - for (auto& gpuWorkerThread : gpuWorkerThreads) { - gpuWorkerThread.join(); + // Just spawn and join new threads for each generation + std::vector cpuCompilationThreads; + cpuCompilationThreads.reserve(FLAGS_tuner_threads); + ScopeGuard sgCompilationThreads([&cpuCompilationThreads]() { + for (auto& cpuCompilationThread : cpuCompilationThreads) { + cpuCompilationThread.join(); + } + }); + for (int i = 0; i < FLAGS_tuner_threads; ++i) { + cpuCompilationThreads.emplace_back([this, &engine, &newCandidates]() { + this->doCompile(engine, newCandidates); + }); + } + + // Just spawn and join new threads for each generation + std::vector gpuWorkerThreads; + gpuWorkerThreads.reserve(gpus.size()); + ScopeGuard sgGpuWorkerThreads([&gpuWorkerThreads]() { + for (auto& gpuWorkerThread : gpuWorkerThreads) { + gpuWorkerThread.join(); + } + }); + for (auto gpu : gpus) { + gpuWorkerThreads.emplace_back( + [this, gpu, &engine, &newCandidates, &printer]() { + this->doGpuWork(gpu, engine, newCandidates, printer); + }); } - }); - for (auto gpu : gpus) { - gpuWorkerThreads.emplace_back( - [this, gpu, &engine, &population, &printer]() { - this->doGpuWork(gpu, engine, population, printer); - }); } // At this point everything is synchronized because out of scope, done + std::move(newCandidates.begin(), newCandidates.end(), firstNew); }; - std::cout << "Generation " << generation << std::endl; - setUpJobsAndRun(tuner_->population, "Population"); + std::cout << "Generation " << generation << ':' << std::endl; + setUpJobsAndRun(tuner_->population, "New Candidates"); tuner_->generateSelectionPool(); setUpJobsAndRun(tuner_->selectionPool, "Selection Pool"); tuner_->selectSurvivors(); From a05e0cc5978b8c31a4b5e6fdef0be4a2a9be4921 Mon Sep 17 00:00:00 2001 From: Theodoros Theodoridis Date: Fri, 16 Mar 2018 15:43:05 +0100 Subject: [PATCH 7/9] Move mathematical functions to utils.h --- tc/autotuner/genetic_search.cc | 35 ++-------------------------------- tc/autotuner/utils/utils.cc | 34 +++++++++++++++++++++++++++++++++ tc/autotuner/utils/utils.h | 4 ++++ 3 files changed, 40 insertions(+), 33 deletions(-) diff --git a/tc/autotuner/genetic_search.cc b/tc/autotuner/genetic_search.cc index 8bd3df169..c813bcd70 100644 --- a/tc/autotuner/genetic_search.cc +++ b/tc/autotuner/genetic_search.cc @@ -21,6 +21,8 @@ #include #include +#include 
"tc/autotuner/utils/utils.h" + namespace tc { namespace autotune { @@ -74,39 +76,6 @@ void mutate( } } -double mean(std::vector& v) { - if (v.empty()) { - throw std::invalid_argument("Cannot compute the mean of an empty vector."); - } - auto sum = std::accumulate(v.begin(), v.end(), 0.0); - return sum / v.size(); -} - -double stdv(std::vector& v, double mean) { - std::vector diffs(v.size()); - std::transform(v.begin(), v.end(), diffs.begin(), [mean](double val) { - return val - mean; - }); - - auto squareSum = - std::inner_product(diffs.begin(), diffs.end(), diffs.begin(), 0.0); - return std::sqrt(squareSum / v.size()); -} - -void sigmaScale(std::vector& v) { - auto m = mean(v); - auto s = stdv(v, m); - std::transform(v.begin(), v.end(), v.begin(), [m, s](double val) { - return std::max(val - (m - 2 * s), 0.0); - }); -} - -void normalizeVector(std::vector& v) { - auto sum = std::accumulate(v.begin(), v.end(), 0.0); - std::transform( - v.begin(), v.end(), v.begin(), [sum](double v) { return v / sum; }); -} - std::vector computeNormalizedFitness( const GeneticSearch::Population& population) { std::vector fitness; diff --git a/tc/autotuner/utils/utils.cc b/tc/autotuner/utils/utils.cc index e38e752cd..3efeeb17c 100644 --- a/tc/autotuner/utils/utils.cc +++ b/tc/autotuner/utils/utils.cc @@ -15,6 +15,7 @@ */ #include #include +#include #include "tc/aten/aten_compiler.h" #include "tc/autotuner/utils/utils.h" @@ -109,5 +110,38 @@ llvm::Optional getBestOptions( return llvm::Optional{}; } +double mean(std::vector& v) { + if (v.empty()) { + throw std::invalid_argument("Cannot compute the mean of an empty vector."); + } + auto sum = std::accumulate(v.begin(), v.end(), 0.0); + return sum / v.size(); +} + +double stdv(std::vector& v, double mean) { + std::vector diffs(v.size()); + std::transform(v.begin(), v.end(), diffs.begin(), [mean](double val) { + return val - mean; + }); + + auto squareSum = + std::inner_product(diffs.begin(), diffs.end(), diffs.begin(), 0.0); + return std::sqrt(squareSum / v.size()); +} + +void sigmaScale(std::vector& v) { + auto m = mean(v); + auto s = stdv(v, m); + std::transform(v.begin(), v.end(), v.begin(), [m, s](double val) { + return std::max(val - (m - 2 * s), 0.0); + }); +} + +void normalizeVector(std::vector& v) { + auto sum = std::accumulate(v.begin(), v.end(), 0.0); + std::transform( + v.begin(), v.end(), v.begin(), [sum](double v) { return v / sum; }); +} + } // namespace autotune } // namespace tc diff --git a/tc/autotuner/utils/utils.h b/tc/autotuner/utils/utils.h index 2e5367e92..33c659c93 100644 --- a/tc/autotuner/utils/utils.h +++ b/tc/autotuner/utils/utils.h @@ -59,6 +59,10 @@ struct OptionsWithMedianTime { std::vector getOptionsAndMedianRuntimes( const lang::CanonicalTcString& id, const std::vector& inputs); +double mean(std::vector& v); +double stdv(std::vector& v, double mean); +void normalizeVector(std::vector& v); +void sigmaScale(std::vector& v); } // namespace autotune } // namespace tc From d83258371121386732a77a8f373bd50dedf07e82 Mon Sep 17 00:00:00 2001 From: Theodoros Theodoridis Date: Fri, 16 Mar 2018 16:02:08 +0100 Subject: [PATCH 8/9] [genetic search] Handle invalid generations properly --- tc/autotuner/genetic_search.cc | 16 +++++++++++++--- tc/autotuner/genetic_search.h | 2 +- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/tc/autotuner/genetic_search.cc b/tc/autotuner/genetic_search.cc index c813bcd70..7d514eeb6 100644 --- a/tc/autotuner/genetic_search.cc +++ b/tc/autotuner/genetic_search.cc @@ -301,7 +301,7 @@ void 
GeneticSearch::breed() { } } -void GeneticSearch::resetPopulationIfNotEnoughCandidates() { +bool GeneticSearch::resetPopulationIfNotEnoughCandidates() { if (population.size() < kMinCandidatesForBreeding) { LOG_IF(ERROR, FLAGS_debug_tuner) << population.size() << " out of " << kMaxPopulationSize @@ -320,7 +320,16 @@ void GeneticSearch::resetPopulationIfNotEnoughCandidates() { // Don't lose the first one which was the best from before CHECK_LT(0, population.size()); randomizePopulation(population.begin() + 1, population.end(), rng); + + selectionPool.clear(); + for (size_t i = 0; i < kSelectionPoolSize; ++i) { + selectionPool.emplace_back( + make_unique(lastBestConf)); + } + randomizePopulation(selectionPool.begin() + 1, selectionPool.end(), rng); + return true; } + return false; } namespace { @@ -352,8 +361,9 @@ void GeneticSearch::generateSelectionPool() { sortByRuntime(population); updateBestCandidate( population.size() > 0 ? population.front()->configuration : lastBestConf); - resetPopulationIfNotEnoughCandidates(); - breed(); + if (resetPopulationIfNotEnoughCandidates()) { + return; + } selectionPool.clear(); selectionPool.emplace_back(make_unique(lastBestConf)); breed(); diff --git a/tc/autotuner/genetic_search.h b/tc/autotuner/genetic_search.h index 5683173e7..1c89ca5ee 100644 --- a/tc/autotuner/genetic_search.h +++ b/tc/autotuner/genetic_search.h @@ -107,7 +107,7 @@ class GeneticSearch { void updateBestCandidate(const TuningConfiguration& c); - void resetPopulationIfNotEnoughCandidates(); + bool resetPopulationIfNotEnoughCandidates(); TuningConfiguration crossover( TuningConfiguration&, TuningConfiguration&, From f340b602567b17e86905fe48d47ef3d880e86c77 Mon Sep 17 00:00:00 2001 From: Theodoros Theodoridis Date: Tue, 20 Mar 2018 13:37:36 +0100 Subject: [PATCH 9/9] [genetic tuning] Do not print confusing stats Each generation includes two evaluation phases (marked with *): 1) Benchmarking of candidates * 2) Breeding 3) Benchmarking of new candidates and selection * Except for the first generation, most of the candidates of step 1) will usually have been evaluated during step 3) of the previous generation. This commit removes the "Jobs(Compiled,GPU) (0,0)/0" message that was printed during step 1) of almost every generation. In theory it is possible not to have enough valid candidates to form a new generation. In those cases step 1) will be run and the progress message will be printed. --- tc/autotuner/genetic_tuning_harness.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tc/autotuner/genetic_tuning_harness.cc b/tc/autotuner/genetic_tuning_harness.cc index b7dd356e2..df8922b9d 100644 --- a/tc/autotuner/genetic_tuning_harness.cc +++ b/tc/autotuner/genetic_tuning_harness.cc @@ -539,6 +539,9 @@ void GeneticTunerHarness::runOneGeneration(size_t generation) { [](const std::unique_ptr& c) { return c->runtime != Duration::zero(); }); + if (std::distance(firstNew, population.end()) == 0) { + return; + } GeneticSearch::Population newCandidates( std::distance(firstNew, population.end())); std::move(firstNew, population.end(), newCandidates.begin());