From 446b19b4ea85e1a15cf8443c86bcb28803277e70 Mon Sep 17 00:00:00 2001
From: Theodoros Theodoridis <theodoridisgr@gmail.com>
Date: Tue, 13 Mar 2018 11:12:37 +0100
Subject: [PATCH 1/5] [genetic search] Scale fitness before selection

Using an unscaled fitness value for selection is problematic:

-Outstanding individuals take over very quickly, which leads to
premature convergence.

-When fitness values are close together, very little selection
pressure is applied and selection is almost uniformly random.
Having slightly better fitness does not improve an individual's
survival chances.

-Shifting the fitness function (e.g. adding a constant value)
changes the selection probabilities even though the location of the
optimum (and the "shape" of the fitness) remains unchanged.

Scaling the fitness function helps ameliorate those issues.

Sigma scaling is used:

fitness' = max(fitness - (mean_fitness - 2 * std_fitness), 0)
---
 tc/autotuner/genetic_search.cc | 31 ++++++++++++++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)
diff --git a/tc/autotuner/genetic_search.cc b/tc/autotuner/genetic_search.cc
index b004b4916..742da64e3 100644
--- a/tc/autotuner/genetic_search.cc
+++ b/tc/autotuner/genetic_search.cc
@@ -16,6 +16,8 @@
 
 #include "tc/autotuner/genetic_search.h"
 
+#include <algorithm>
+#include <numeric>
 #include <random>
 #include <sstream>
 
@@ -74,9 +76,35 @@ void mutate(
   }
 }
 
-void normalizeVector(std::vector<double>& v) {
+double mean(std::vector<double>& v) {
+  if (v.empty()) {
+    throw std::invalid_argument("Cannot compute the mean of an empty vector.");
+  }
   auto sum = std::accumulate(v.begin(), v.end(), 0.0);
+  return sum / v.size();
+}
+
+double stdv(std::vector<double>& v, double mean) {
+  std::vector<double> diffs(v.size());
+  std::transform(v.begin(), v.end(), diffs.begin(), [mean](double val) {
+    return val - mean;
+  });
+
+  auto squareSum =
+      std::inner_product(diffs.begin(), diffs.end(), diffs.begin(), 0.0);
+  return std::sqrt(squareSum / v.size());
+}
+
+void sigmaScale(std::vector<double>& v) {
+  auto m = mean(v);
+  auto s = stdv(v, m);
+  std::transform(v.begin(), v.end(), v.begin(), [m, s](double val) {
+    return std::max(val - (m - 2 * s), 0.0);
+  });
+}
 
+void normalizeVector(std::vector<double>& v) {
+  auto sum = std::accumulate(v.begin(), v.end(), 0.0);
   std::transform(
       v.begin(), v.end(), v.begin(), [sum](double v) { return v / sum; });
 }
@@ -92,6 +120,7 @@ std::vector<double> computeNormalizedFitness(
       [](const std::unique_ptr<CandidateConfiguration>& c) {
         return 1.0 / c->runtime.toMicroSeconds();
       });
+  sigmaScale(fitness);
   normalizeVector(fitness);
   return fitness;
 }

From d23898e5aa698a49b9adf68b781090a819862fce Mon Sep 17 00:00:00 2001
From: Theodoros Theodoridis <theodoridisgr@gmail.com>
Date: Thu, 15 Mar 2018 10:14:40 +0100
Subject: [PATCH 2/5] [genetic search] Fix bug that disabled crossover

---
 tc/autotuner/genetic_search.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tc/autotuner/genetic_search.cc b/tc/autotuner/genetic_search.cc
index 742da64e3..924597dd6 100644
--- a/tc/autotuner/genetic_search.cc
+++ b/tc/autotuner/genetic_search.cc
@@ -258,9 +258,9 @@ void GeneticSearch::breed() {
   auto accFitness = computeAccumulatedFitness(population);
   Population new_population;
   new_population.reserve(maxPopulationSize);
-  for (auto& p : population) {
+  for (size_t c = 0; c < numberElites; ++c) {
     new_population.push_back(
-        make_unique<CandidateConfiguration>(p->configuration));
+        make_unique<CandidateConfiguration>(population.at(c)->configuration));
   }
 
   auto select = [&]() -> const TuningConfiguration& {

From cfad85e8ba0656430d3a5fa27ff3e23b45b29010 Mon Sep 17 00:00:00 2001
From: Theodoros Theodoridis <theodoridisgr@gmail.com>
Date: Thu, 15 Mar 2018 10:15:34 +0100
Subject: [PATCH 3/5] [genetic search] Switch to Stochastic Universal Sampling

Stochastic Universal Sampling is an improvement upon the roulette
algorithm that was previously used.
---
 tc/autotuner/genetic_search.cc | 34 ++++++++++++++++++++++++++++------
 tc/autotuner/genetic_search.h  |  3 +++
 2 files changed, 31 insertions(+), 6 deletions(-)

diff --git a/tc/autotuner/genetic_search.cc b/tc/autotuner/genetic_search.cc
index 924597dd6..840ac4cdb 100644
--- a/tc/autotuner/genetic_search.cc
+++ b/tc/autotuner/genetic_search.cc
@@ -195,6 +195,7 @@ GeneticSearch::GeneticSearch(
       lastBestConf(confs[0]),
       numGenerations(numGenerations),
       maxPopulationSize(populationSize),
+      matingPoolSize(populationSize * 3),
       crossOverRate(crossOverRate),
       mutationRate(mutationRate),
       numberElites(numberElites),
@@ -254,19 +255,40 @@ TuningConfiguration GeneticSearch::crossover(
   return a;
 }
 
+std::vector<TuningConfiguration> GeneticSearch::stochasticUniversalSampling(
+    const std::vector<double>& fitness) const {
+  std::vector<TuningConfiguration> matingPool;
+  matingPool.reserve(matingPoolSize);
+
+  auto r = std::uniform_real_distribution<double>(0, 1.0 / matingPoolSize)(rng);
+  size_t count = 0;
+  size_t i = 0;
+  while (count < matingPoolSize) {
+    while (r <= fitness[i]) {
+      matingPool.push_back(population[i]->configuration);
+      r += 1.0 / matingPoolSize;
+      ++count;
+    }
+    ++i;
+  }
+  return matingPool;
+}
+
 void GeneticSearch::breed() {
-  auto accFitness = computeAccumulatedFitness(population);
+  auto matingPool =
+      stochasticUniversalSampling(computeAccumulatedFitness(population));
+
   Population new_population;
-  new_population.reserve(maxPopulationSize);
+  new_population.reserve(matingPoolSize);
   for (size_t c = 0; c < numberElites; ++c) {
     new_population.push_back(
         make_unique<CandidateConfiguration>(population.at(c)->configuration));
   }
 
-  auto select = [&]() -> const TuningConfiguration& {
-    auto limit = std::uniform_real_distribution<double>{}(rng);
-    auto lb = std::lower_bound(accFitness.begin(), accFitness.end(), limit);
-    return population.at(std::distance(accFitness.begin(), lb))->configuration;
+  auto select = [&]() -> TuningConfiguration& {
+    auto idx = std::uniform_int_distribution<size_t>{
+        size_t(0), matingPool.size() - 1}(rng);
+    return matingPool.at(idx);
   };
   auto shouldCrossOver = [&]() -> bool {
     /*
diff --git a/tc/autotuner/genetic_search.h b/tc/autotuner/genetic_search.h
index 5c5f7f02d..16d29e2b8 100644
--- a/tc/autotuner/genetic_search.h
+++ b/tc/autotuner/genetic_search.h
@@ -79,6 +79,8 @@ class GeneticSearch {
   void updateParameters();
 
  private:
+  std::vector<TuningConfiguration> stochasticUniversalSampling(
+      const std::vector<double>& fitness) const;
   void breed();
 
   TuningConfiguration crossover(
@@ -96,6 +98,7 @@ class GeneticSearch {
   TuningConfiguration lastBestConf;
   const size_t numGenerations;
   const size_t maxPopulationSize;
+  const size_t matingPoolSize;
   const uint8_t crossOverRate;
   const uint8_t mutationRate;
   const size_t numberElites;

From 43ac05439aebad5b0290cb7be9f97479e2bf76af Mon Sep 17 00:00:00 2001
From: nicolasvasilache <nicolas.vasilache@gmail.com>
Date: Sun, 10 Jun 2018 20:19:56 -0600
Subject: [PATCH 4/5] Update tuning command on FAIR cluster

This is needed following the changes to the build system.
---
 tc/benchmarks/scripts/autotuner_parallel.sh | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/tc/benchmarks/scripts/autotuner_parallel.sh b/tc/benchmarks/scripts/autotuner_parallel.sh
index 07b351a2b..a76d374ba 100755
--- a/tc/benchmarks/scripts/autotuner_parallel.sh
+++ b/tc/benchmarks/scripts/autotuner_parallel.sh
@@ -8,7 +8,12 @@
 #SBATCH --gres=gpu:2
 #SBATCH --partition=priority,uninterrupted,learnfair,scavenge
 
-export TUNER_THREADS=${TUNER_THREADS:=20}
+module load cuda/9.0
+. ${HOME}/anaconda/bin/activate
+conda activate tc_build
+export LD_LIBRARY_PATH=${CONDA_PREFIX}/lib:${LD_LIBRARY_PATH}
+
+export TUNER_THREADS=${TUNER_THREADS:=8}
 export TUNER_DEVICES=${TUNER_DEVICES:="0,1"}
 export DEVICE_NAME=$(nvidia-smi -L | head -n 1 | cut -d'(' -f 1 | cut -d':' -f 2 | sed "s/ //g")
 

From 380c0542297081a7d3a250c03ec10e21bfb7688a Mon Sep 17 00:00:00 2001
From: nicolasvasilache <nicolas.vasilache@gmail.com>
Date: Sun, 10 Jun 2018 21:59:21 -0600
Subject: [PATCH 5/5] Drop wavenet2 for now

It is too slow to converge and does not yield perf gains
---
 tc/benchmarks/scripts/AUTOTUNER_COMMANDS    | 4 ++--
 tc/benchmarks/scripts/autotuner_parallel.sh | 3 ++-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/tc/benchmarks/scripts/AUTOTUNER_COMMANDS b/tc/benchmarks/scripts/AUTOTUNER_COMMANDS
index 78347c632..9c4aa582b 100644
--- a/tc/benchmarks/scripts/AUTOTUNER_COMMANDS
+++ b/tc/benchmarks/scripts/AUTOTUNER_COMMANDS
@@ -153,8 +153,8 @@ echo CUDA_LAUNCH_BLOCKING=1 ./build/tc/benchmarks/benchmark_wavenet --gtest_filt
 #
 #
 #
-echo CUDA_LAUNCH_BLOCKING=1 ./build/tc/benchmarks/benchmark_wavenet --gtest_filter="*.WaveNet2" --B=1 --RESIDUAL_C=32 --DILATION_C=32 --SKIP_C=256 --RECEPTIVE_FIELD=4000 --DILATION_FACTOR=1 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_devices="${TUNER_DEVICES}" --save_tuner_proto_prefix=${LOG_DIR}/ >> ${LOG_DIR}/wavenet2_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_1.log 2>&1 && CUDA_LAUNCH_BLOCKING=1 ./build/tc/benchmarks/benchmark_wavenet --gtest_filter="*.WaveNet2" --B=1 --RESIDUAL_C=32 --DILATION_C=32 --SKIP_C=256 --RECEPTIVE_FIELD=4000 --DILATION_FACTOR=1 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_devices="${TUNER_DEVICES}" --save_tuner_proto_prefix=${LOG_DIR}/ >> ${LOG_DIR}/wavenet2_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_1.log 2>&1
+#echo CUDA_LAUNCH_BLOCKING=1 ./build/tc/benchmarks/benchmark_wavenet --gtest_filter="*.WaveNet2" --B=1 --RESIDUAL_C=32 --DILATION_C=32 --SKIP_C=256 --RECEPTIVE_FIELD=4000 --DILATION_FACTOR=1 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_devices="${TUNER_DEVICES}" --save_tuner_proto_prefix=${LOG_DIR}/ >> ${LOG_DIR}/wavenet2_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_1.log 2>&1 && CUDA_LAUNCH_BLOCKING=1 ./build/tc/benchmarks/benchmark_wavenet --gtest_filter="*.WaveNet2" --B=1 --RESIDUAL_C=32 --DILATION_C=32 --SKIP_C=256 --RECEPTIVE_FIELD=4000 --DILATION_FACTOR=1 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_devices="${TUNER_DEVICES}" --save_tuner_proto_prefix=${LOG_DIR}/ >> ${LOG_DIR}/wavenet2_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_1.log 2>&1
 #
 #
 #
-echo CUDA_LAUNCH_BLOCKING=1 ./build/tc/benchmarks/benchmark_wavenet --gtest_filter="*.WaveNet2" --B=1 --RESIDUAL_C=32 --DILATION_C=32 --SKIP_C=256 --RECEPTIVE_FIELD=4000 --DILATION_FACTOR=32 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_devices="${TUNER_DEVICES}" --save_tuner_proto_prefix=${LOG_DIR}/ >> ${LOG_DIR}/wavenet2_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_32.log 2>&1 && CUDA_LAUNCH_BLOCKING=1 ./build/tc/benchmarks/benchmark_wavenet --gtest_filter="*.WaveNet2" --B=1 --RESIDUAL_C=32 --DILATION_C=32 --SKIP_C=256 --RECEPTIVE_FIELD=4000 --DILATION_FACTOR=32 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_devices="${TUNER_DEVICES}" --save_tuner_proto_prefix=${LOG_DIR}/ >> ${LOG_DIR}/wavenet2_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_32.log 2>&1
\ No newline at end of file
+#echo CUDA_LAUNCH_BLOCKING=1 ./build/tc/benchmarks/benchmark_wavenet --gtest_filter="*.WaveNet2" --B=1 --RESIDUAL_C=32 --DILATION_C=32 --SKIP_C=256 --RECEPTIVE_FIELD=4000 --DILATION_FACTOR=32 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_devices="${TUNER_DEVICES}" --save_tuner_proto_prefix=${LOG_DIR}/ >> ${LOG_DIR}/wavenet2_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_32.log 2>&1 && CUDA_LAUNCH_BLOCKING=1 ./build/tc/benchmarks/benchmark_wavenet --gtest_filter="*.WaveNet2" --B=1 --RESIDUAL_C=32 --DILATION_C=32 --SKIP_C=256 --RECEPTIVE_FIELD=4000 --DILATION_FACTOR=32 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_devices="${TUNER_DEVICES}" --save_tuner_proto_prefix=${LOG_DIR}/ >> ${LOG_DIR}/wavenet2_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_32.log 2>&1
\ No newline at end of file
diff --git a/tc/benchmarks/scripts/autotuner_parallel.sh b/tc/benchmarks/scripts/autotuner_parallel.sh
index a76d374ba..0ba8e5765 100755
--- a/tc/benchmarks/scripts/autotuner_parallel.sh
+++ b/tc/benchmarks/scripts/autotuner_parallel.sh
@@ -28,4 +28,5 @@ cat ${TC_PREFIX}/tc/benchmarks/scripts/AUTOTUNER_COMMANDS | grep -v "\#" | head
 cat ${TC_PREFIX}/tc/benchmarks/scripts/AUTOTUNER_COMMANDS | grep -v "\#" | head -n ${SLURM_ARRAY_TASK_ID} | tail -n 1 | xargs -i bash -c "{}"
 
 # Run with:
-# sbatch --array=1-40 -C volta ./tc/benchmarks/scripts/autotuner_parallel.sh
+# export NUM_TO_RUN=$(cat ${TC_PREFIX}/tc/benchmarks/scripts/AUTOTUNER_COMMANDS | grep -v "\#" | wc -l)
+# sbatch --array=1-${NUM_TO_RUN} -C volta ./tc/benchmarks/scripts/autotuner_parallel.sh