facebookresearch
diff --git a/‎docs/source/framework/pytorch_integration/autotuning_layers.rst
Lines changed: 3 additions & 4 deletions b/‎docs/source/framework/pytorch_integration/autotuning_layers.rst
Lines changed: 3 additions & 4 deletions
diff --git a/‎docs/source/tutorials/tutorial_tensordot_with_tc.rst
Lines changed: 1 addition & 1 deletion b/‎docs/source/tutorials/tutorial_tensordot_with_tc.rst
Lines changed: 1 addition & 1 deletion
diff --git a/‎tc/autotuner/autotuner-inl.h
Lines changed: 51 additions & 48 deletions b/‎tc/autotuner/autotuner-inl.h
Lines changed: 51 additions & 48 deletions
diff --git a/‎tc/autotuner/genetic_search.cc
Lines changed: 87 additions & 50 deletions b/‎tc/autotuner/genetic_search.cc
Lines changed: 87 additions & 50 deletions
@@ -58,7 +58,6 @@ You can read about all the parameters here - :ref:`autotuner_parameters`.
 - :code:`threads` - set this to number of CPU cores available.
 - :code:`generations` - 5 to 10 generations is a good number.
 - :code:`pop_size` - 10 is usually reasonable. You can try 10 to 20.
-- :code:`number_elites` - number of candidates preserved intact between generations. `1` is usually sufficient.
 - :code:`min_launch_total_threads` - If you have really small input sizes, set this to `1`.
 - :code:`gpus`: Number of gpus to use for autotuning. Default value is "0". Set this to "0,1" if you wish to use two gpus (for example).
 
@@ -70,15 +69,15 @@ kernel timing. You can adopt the following parameter settings as starters for au
 .. code::
 
      settings = {
-         "threads": 32, "generations": 2, "pop_size": 10, "number_elites": 1
+         "threads": 32, "generations": 2, "pop_size": 10
      }
 
 * The good defaults that run for a bit longer (in exchange for better performance):
 
 .. code::
 
      settings = {
-         "threads": 32, "generations": 5, "pop_size": 10, "number_elites": 1
+         "threads": 32, "generations": 5, "pop_size": 10
      }
 
 
@@ -87,7 +86,7 @@ kernel timing. You can adopt the following parameter settings as starters for au
 .. code::
 
      settings = {
-         "threads": 32, "generations": 25, "pop_size": 100, "number_elites": 10
+         "threads": 32, "generations": 25, "pop_size": 100
      }
 
 
 
@@ -132,7 +132,7 @@ later.
 You can control the amount of autotuning by changing the autotuner parameters. See
 :ref:`autotune_parameters` for how to change the settings.
 
-For the setting ``settings={"generations": 25, "pop_size": 100, "number_elites": 10}``, we
+For the setting ``settings={"generations": 25, "pop_size": 100}``, we
 get a decent kernel performance as shown in the screenshot below (tuned on one M40 GPU):
 
 .. figure:: ../_static/img/autotuning-py.jpg
 
@@ -243,56 +243,58 @@ void TuningHarness<Backend>::runOneIteration(
     size_t iteration) {
   // Define tensors per device once globally
   auto devices = detail::parseDevices<Backend>(FLAGS_tuner_devices);
-  CHECK(executors_.empty());
-  CHECK(configurations_.empty());
   for (uint64_t step = 0; step < searchStrategy.stepsPerIteration; ++step) {
-    auto& candidates = searchStrategy.candidatesOfStep(step);
-    // Initialize for this round
-    currentCompilationJob_.store(0);
-    numEvaluations_.store(0);
-    Printer printer(
-        iteration,
-        step,
-        candidates.size(),
-        currentCompilationJob_,
-        numEvaluations_);
-    auto logIterations = FLAGS_tuner_gen_log_generations;
-    ScopeGuard sgPrinter([logIterations, &printer]() {
-      printer.stop();
-      if (logIterations) {
-        printer.printAll();
-      }
-    });
-
-    // Just spawn and join new threads for each iteration
-    std::vector<std::thread> cpuCompilationThreads;
-    cpuCompilationThreads.reserve(FLAGS_tuner_threads);
-    ScopeGuard sgCompilationThreads([&cpuCompilationThreads]() {
-      for (auto& cpuCompilationThread : cpuCompilationThreads) {
-        cpuCompilationThread.join();
-      }
-    });
-    for (size_t i = 0; i < FLAGS_tuner_threads; ++i) {
-      cpuCompilationThreads.emplace_back(
-          [this, &candidates]() { this->doCompile(candidates); });
-    }
+    {
+      CHECK(executors_.empty());
+      CHECK(configurations_.empty());
+      auto& candidates = searchStrategy.candidatesOfStep(step);
+      // Initialize for this round
+      currentCompilationJob_.store(0);
+      numEvaluations_.store(0);
+      Printer printer(
+          iteration,
+          step,
+          candidates.size(),
+          currentCompilationJob_,
+          numEvaluations_);
+      auto logIterations = FLAGS_tuner_gen_log_generations;
+      ScopeGuard sgPrinter([logIterations, &printer]() {
+        printer.stop();
+        if (logIterations) {
+          printer.printAll();
+        }
+      });
 
-    // Just spawn and join new threads for each device
-    std::vector<std::thread> workerThreads;
-    workerThreads.reserve(devices.size());
-    LOG_IF(INFO, tc::FLAGS_debug_tuner)
-        << "Start evaluation: " << devices.size() << " " << executors_.size()
-        << " " << configurations_.size();
-    ScopeGuard sgDeviceWorkerThreads([&workerThreads]() {
-      for (auto& workerThread : workerThreads) {
-        workerThread.join();
+      // Just spawn and join new threads for each iteration
+      std::vector<std::thread> cpuCompilationThreads;
+      cpuCompilationThreads.reserve(FLAGS_tuner_threads);
+      ScopeGuard sgCompilationThreads([&cpuCompilationThreads]() {
+        for (auto& cpuCompilationThread : cpuCompilationThreads) {
+          cpuCompilationThread.join();
+        }
+      });
+      for (size_t i = 0; i < FLAGS_tuner_threads; ++i) {
+        cpuCompilationThreads.emplace_back(
+            [this, &candidates]() { this->doCompile(candidates); });
       }
-    });
-    auto populationSize = candidates.size();
-    for (auto device : devices) {
-      workerThreads.emplace_back([this, device, populationSize, &printer]() {
-        this->doEvaluate(device, populationSize, printer);
+
+      // Just spawn and join new threads for each device
+      std::vector<std::thread> workerThreads;
+      workerThreads.reserve(devices.size());
+      LOG_IF(INFO, tc::FLAGS_debug_tuner)
+          << "Start evaluation: " << devices.size() << " " << executors_.size()
+          << " " << configurations_.size();
+      ScopeGuard sgDeviceWorkerThreads([&workerThreads]() {
+        for (auto& workerThread : workerThreads) {
+          workerThread.join();
+        }
       });
+      auto populationSize = candidates.size();
+      for (auto device : devices) {
+        workerThreads.emplace_back([this, device, populationSize, &printer]() {
+          this->doEvaluate(device, populationSize, printer);
+        });
+      }
     }
     searchStrategy.finishStep(step);
   }
@@ -305,7 +307,6 @@ void TuningHarness<Backend>::runOneIteration(
     infoPrinter << bestMappingOptions();
     LOG_LINE_BY_LINE(INFO, ssInfo);
   }
-  searchStrategy.updateParameters();
 }
 } // namespace detail
 
@@ -462,13 +463,15 @@ Autotuner<Backend, SearchStrategy>::tune(
       });
 
   // searchStrategy is passed to tuningHarness.run()
+  // XXX: this is not generic: the arguments below are genetic-search flags
   SearchStrategy searchStrategy(
       configs,
       FLAGS_tuner_gen_generations,
       FLAGS_tuner_gen_pop_size,
       FLAGS_tuner_gen_crossover_rate,
       FLAGS_tuner_gen_mutation_rate,
-      FLAGS_tuner_gen_number_elites);
+      FLAGS_tuner_gen_mating_pool_size,
+      FLAGS_tuner_gen_selection_pool_size);
 
   // Create a tuning harness
   detail::TuningHarness<Backend> tuningHarness(
 
@@ -33,11 +33,8 @@ void randomizeParameter(Parameter& param, RNG& rng) {
   param.selectOption(paramIndex);
 }
 
-template <typename RNG>
-void randomizePopulation(
-    GeneticSearch::Population::iterator begin,
-    GeneticSearch::Population::iterator end,
-    RNG& rng) {
+template <typename RNG, typename Iterator>
+void randomizePopulation(Iterator begin, Iterator end, RNG& rng) {
   for (auto candidate = begin; candidate != end; ++candidate) {
     auto& conf = (*candidate)->configuration;
     do {
@@ -160,7 +157,8 @@ void dropInvalidConfigurations(GeneticSearch::Population& population) {
 } // namespace
 
 #define VALIDATE()                                   \
-  CHECK_LT(numberElites, maxPopulationSize);         \
+  CHECK_LT(maxPopulationSize, matingPoolSize);       \
+  CHECK_LT(maxPopulationSize, selectionPoolSize);    \
   CHECK(mutationRate >= 0 and mutationRate <= 100)   \
       << "the mutation rate (" << mutationRate       \
       << ") should be in the [0,100] interval";      \
@@ -188,15 +186,16 @@ GeneticSearch::GeneticSearch(
     size_t populationSize,
     uint8_t crossOverRate,
     uint8_t mutationRate,
-    size_t numberElites)
+    size_t matingPoolSize,
+    size_t selectionPoolSize)
     : population(),
       lastBestConf(confs[0]),
       numGenerations(numGenerations),
       maxPopulationSize(populationSize),
-      matingPoolSize(populationSize * 3),
+      matingPoolSize(matingPoolSize),
+      selectionPoolSize(selectionPoolSize),
       crossOverRate(crossOverRate),
       mutationRate(mutationRate),
-      numberElites(numberElites),
       rng{std::random_device{}()} {
   restoreRngState(rng);
   VALIDATE();
@@ -276,13 +275,6 @@ void GeneticSearch::breed() {
   auto matingPool =
       stochasticUniversalSampling(computeAccumulatedFitness(population));
 
-  Population new_population;
-  new_population.reserve(matingPoolSize);
-  for (size_t c = 0; c < numberElites; ++c) {
-    new_population.push_back(
-        make_unique<CandidateConfiguration>(population.at(c)->configuration));
-  }
-
   auto select = [&]() -> TuningConfiguration& {
     auto idx = std::uniform_int_distribution<size_t>{
         size_t(0), matingPool.size() - 1}(rng);
@@ -298,39 +290,20 @@ void GeneticSearch::breed() {
     return dist(rng);
   };
 
-  while (new_population.size() < maxPopulationSize) {
+  while (selectionPool.size() < selectionPoolSize) {
     if (shouldCrossOver()) {
       auto parent1 = select();
       auto parent2 = select();
       auto parent3 = select();
-      new_population.emplace_back(make_unique<CandidateConfiguration>(
+      selectionPool.emplace_back(make_unique<CandidateConfiguration>(
           crossover(parent1, parent2, parent3)));
     } else {
-      new_population.emplace_back(
-          make_unique<CandidateConfiguration>(select()));
+      selectionPool.emplace_back(make_unique<CandidateConfiguration>(select()));
     }
   }
-  population = std::move(new_population);
 }
 
-void GeneticSearch::updateParameters() {
-  dropInvalidConfigurations(population);
-
-  // Sort population before taking any decision
-  std::sort(
-      population.begin(),
-      population.end(),
-      [](const std::unique_ptr<CandidateConfiguration>& a,
-         const std::unique_ptr<CandidateConfiguration>& b) {
-        checkRuntimeRecorded(a->runtime);
-        checkRuntimeRecorded(b->runtime);
-        return a->runtime < b->runtime;
-      });
-
-  // Update failsafe lastBestConf
-  lastBestConf =
-      population.size() > 0 ? population.front()->configuration : lastBestConf;
-
+bool GeneticSearch::resetPopulationIfNotEnoughCandidates() {
   if (population.size() < minCandidatesForBreeding) {
     LOG_IF(ERROR, FLAGS_debug_tuner)
         << population.size() << " out of " << maxPopulationSize
@@ -341,30 +314,94 @@ void GeneticSearch::updateParameters() {
            "--tuner_min_launch_total_threads=1. This is mostly relevant "
            "when autotuning a TC operating on small tensors. The next "
            "generation will be randomly initialized.";
-    population.resize(0);
-    for (size_t i = 0; i < maxPopulationSize; ++i) {
-      population.emplace_back(
+    selectionPool.clear();
+    for (size_t i = 0; i < selectionPoolSize; ++i) {
+      selectionPool.emplace_back(
           make_unique<CandidateConfiguration>(lastBestConf));
     }
     // Don't lose the first one which was the best from before
-    CHECK_LT(0u, population.size());
-    randomizePopulation(population.begin() + 1, population.end(), rng);
-    return;
+    randomizePopulation(selectionPool.begin() + 1, selectionPool.end(), rng);
+    return true;
   }
+  return false;
+}
+
+namespace {
+// Orders `population` in place by ascending recorded runtime.
+// checkRuntimeRecorded() is invoked on both operands of every comparison
+// before their runtimes are compared.
+void sortByRuntime(GeneticSearch::Population& population) {
+  using CandidatePtr = std::unique_ptr<CandidateConfiguration>;
+  const auto fasterThan = [](const CandidatePtr& lhs, const CandidatePtr& rhs) {
+    checkRuntimeRecorded(lhs->runtime);
+    checkRuntimeRecorded(rhs->runtime);
+    return lhs->runtime < rhs->runtime;
+  };
+  std::sort(population.begin(), population.end(), fasterThan);
+}
+} // namespace
 
+// Builds `selectionPool` from the current `population`.
+//
+// Invalid configurations are dropped from `population`, the remainder is
+// sorted by runtime, and `lastBestConf` is updated to the front candidate's
+// configuration when the population is not empty. If
+// resetPopulationIfNotEnoughCandidates() returns true, it has already
+// refilled `selectionPool` and nothing else is done. Otherwise the pool is
+// cleared, seeded with `lastBestConf` at index 0, topped up by breed(), and
+// every entry except index 0 is mutated.
+void GeneticSearch::generateSelectionPool() {
+  dropInvalidConfigurations(population);
+  sortByRuntime(population);
+  lastBestConf =
+      population.size() > 0 ? population.front()->configuration : lastBestConf;
+  if (resetPopulationIfNotEnoughCandidates()) {
+    return;
+  }
-  breed();
-  for (size_t i = numberElites; i < population.size(); ++i) {
-    mutate(*population[i], mutationRate, mutateIterations, rng);
+  // breed() is called exactly once, after the pool has been reset: anything
+  // bred before the clear() would be thrown away.
+  selectionPool.clear();
+  selectionPool.emplace_back(make_unique<CandidateConfiguration>(lastBestConf));
+  breed();
+  // Index 0 holds the unmodified best configuration; mutate only the rest.
+  for (size_t i = 1; i < selectionPool.size(); ++i) {
+    mutate(*selectionPool[i], mutationRate, mutateIterations, rng);
+  }
+}
+
+// Rebuilds `population` from `selectionPool`.
+//
+// Invalid configurations are dropped from the pool, the remainder is sorted
+// by runtime, and the configurations of the first
+// min(selectionPool.size(), maxPopulationSize) candidates are copied into a
+// freshly cleared `population`. If that leaves `population` short of
+// maxPopulationSize, the missing slots are filled with copies of
+// `lastBestConf` which are then randomized.
+void GeneticSearch::selectSurvivors() {
+  dropInvalidConfigurations(selectionPool);
+  sortByRuntime(selectionPool);
+  population.clear();
+  std::transform(
+      selectionPool.begin(),
+      selectionPool.begin() + std::min(selectionPool.size(), maxPopulationSize),
+      std::back_inserter(population),
+      [](const std::unique_ptr<CandidateConfiguration>& c) {
+        return make_unique<CandidateConfiguration>(c->configuration);
+      });
+
+  // Pad `population` itself (not `selectionPool`): it is the container being
+  // rebuilt here, whereas `selectionPool` is cleared again before it is next
+  // filled, so padding the pool would have no effect.
+  if (population.size() < maxPopulationSize) {
+    auto numberMissing = maxPopulationSize - population.size();
+
+    for (size_t i = 0; i < numberMissing; ++i) {
+      population.emplace_back(
+          make_unique<CandidateConfiguration>(lastBestConf));
+    }
+    // The padding sits at the back, so randomize the last numberMissing
+    // entries and leave the copied survivors untouched.
+    randomizePopulation(
+        population.rbegin(), population.rbegin() + numberMissing, rng);
   }
 }
 
 GeneticSearch::Population& GeneticSearch::candidatesOfStep(uint64_t step) {
-  if (step != 0) {
-    throw std::invalid_argument("GeneticSearch has only one step");
+  if (step > 1) {
+    throw std::invalid_argument("GeneticSearch has only 2 steps.");
+  }
+  if (step == 0) {
+    return population;
+  } else {
+    return selectionPool;
   }
-  return population;
 }
 
+// Runs the bookkeeping that follows `step`: step 0 generates the selection
+// pool, step 1 selects the survivors. Any other step throws
+// std::invalid_argument.
+void GeneticSearch::finishStep(uint64_t step) {
+  switch (step) {
+    case 0:
+      generateSelectionPool();
+      break;
+    case 1:
+      selectSurvivors();
+      break;
+    default:
+      throw std::invalid_argument("GeneticSearch has only 2 steps.");
+  }
+}
 } // namespace autotune
 } // namespace tc