diff --git a/tc/autotuner/genetic_search.cc b/tc/autotuner/genetic_search.cc
index b004b4916..840ac4cdb 100644
--- a/tc/autotuner/genetic_search.cc
+++ b/tc/autotuner/genetic_search.cc
@@ -16,6 +16,8 @@
 
 #include "tc/autotuner/genetic_search.h"
 
+#include <algorithm>
+#include <numeric>
 #include <random>
 #include <sstream>
 
@@ -74,9 +76,51 @@ void mutate(
   }
 }
 
-void normalizeVector(std::vector<double>& v) {
+// Returns the arithmetic mean of the values in v.
+// Throws std::invalid_argument when v is empty.
+double mean(const std::vector<double>& v) {
+  if (v.empty()) {
+    throw std::invalid_argument("Cannot compute the mean of an empty vector.");
+  }
   auto sum = std::accumulate(v.begin(), v.end(), 0.0);
+  return sum / v.size();
+}
+
+// Returns the population standard deviation of v around mean: the square
+// root of the summed squared deviations divided by v.size() (not size - 1).
+// For an empty v the division is 0 / 0 and the result is NaN.
+double stdv(const std::vector<double>& v, double mean) {
+  // Fold the squared deviations directly; no temporary vector is needed.
+  auto squareSum = std::accumulate(
+      v.begin(), v.end(), 0.0, [mean](double acc, double val) {
+        auto diff = val - mean;
+        return acc + diff * diff;
+      });
+  return std::sqrt(squareSum / v.size());
+}
+
+// Sigma-scales v in place: every value is shifted down by (mean - 2 * stdv)
+// and clamped at 0, so values more than two standard deviations below the
+// mean become 0.
+// Throws std::invalid_argument (from mean) when v is empty.
+void sigmaScale(std::vector<double>& v) {
+  auto m = mean(v);
+  auto s = stdv(v, m);
+  // With no spread (a single value, or all values equal) the shift would map
+  // every value to 0, and a later normalizeVector call would divide 0 by 0.
+  // Leave v untouched in that case; !(s > 0.0) is also true for a NaN s.
+  if (!(s > 0.0)) {
+    return;
+  }
+  std::transform(v.begin(), v.end(), v.begin(), [m, s](double val) {
+    return std::max(val - (m - 2 * s), 0.0);
+  });
+}
 
+// Divides every element of v by the sum of all elements, in place.
+// The sum is not checked, so a zero sum produces non-finite values.
+void normalizeVector(std::vector<double>& v) {
+  auto sum = std::accumulate(v.begin(), v.end(), 0.0);
   std::transform(
       v.begin(), v.end(), v.begin(), [sum](double v) { return v / sum; });
 }
@@ -92,6 +120,7 @@ std::vector<double> computeNormalizedFitness(
       [](const std::unique_ptr<CandidateConfiguration>& c) {
         return 1.0 / c->runtime.toMicroSeconds();
       });
+  sigmaScale(fitness);
   normalizeVector(fitness);
   return fitness;
 }
@@ -166,6 +195,7 @@ GeneticSearch::GeneticSearch(
       lastBestConf(confs[0]),
       numGenerations(numGenerations),
       maxPopulationSize(populationSize),
+      matingPoolSize(populationSize * 3),
       crossOverRate(crossOverRate),
       mutationRate(mutationRate),
       numberElites(numberElites),
@@ -225,19 +255,57 @@ TuningConfiguration GeneticSearch::crossover(
   return a;
 }
 
+// Builds the mating pool with stochastic universal sampling: matingPoolSize
+// equally spaced pointers, offset by a single random draw, are laid over
+// [0, 1) and each pointer selects the first individual whose accumulated
+// fitness reaches it.
+// fitness is read as accumulated normalized fitness (expected to be
+// non-decreasing), with fitness[i] belonging to population[i]; breed() passes
+// computeAccumulatedFitness(population).
+// Returns copies of the selected configurations; the pool is empty when
+// matingPoolSize is 0 or there is no individual to pick from.
+std::vector<TuningConfiguration> GeneticSearch::stochasticUniversalSampling(
+    const std::vector<double>& fitness) const {
+  std::vector<TuningConfiguration> matingPool;
+  matingPool.reserve(matingPoolSize);
+
+  const size_t numCandidates = std::min(fitness.size(), population.size());
+  if (matingPoolSize == 0 || numCandidates == 0) {
+    return matingPool;
+  }
+
+  const double step = 1.0 / matingPoolSize;
+  const double start = std::uniform_real_distribution<double>(0, step)(rng);
+  // Rounding can leave the last accumulated fitness slightly below 1 while a
+  // late pointer lands slightly above it. Clamping each pointer to the total
+  // and never advancing past the last individual keeps the scan in bounds.
+  const double total = fitness[numCandidates - 1];
+  size_t i = 0;
+  for (size_t k = 0; k < matingPoolSize; ++k) {
+    const double pointer = std::min(start + k * step, total);
+    while (i + 1 < numCandidates && fitness[i] < pointer) {
+      ++i;
+    }
+    matingPool.push_back(population[i]->configuration);
+  }
+  return matingPool;
+}
+
 void GeneticSearch::breed() {
-  auto accFitness = computeAccumulatedFitness(population);
+  auto matingPool =
+      stochasticUniversalSampling(computeAccumulatedFitness(population));
+
   Population new_population;
-  new_population.reserve(maxPopulationSize);
-  for (auto& p : population) {
+  new_population.reserve(matingPoolSize);
+  for (size_t c = 0; c < numberElites; ++c) {
     new_population.push_back(
-        make_unique<CandidateConfiguration>(p->configuration));
+        make_unique<CandidateConfiguration>(population.at(c)->configuration));
   }
 
-  auto select = [&]() -> const TuningConfiguration& {
-    auto limit = std::uniform_real_distribution<double>{}(rng);
-    auto lb = std::lower_bound(accFitness.begin(), accFitness.end(), limit);
-    return population.at(std::distance(accFitness.begin(), lb))->configuration;
+  auto select = [&]() -> TuningConfiguration& {
+    auto idx = std::uniform_int_distribution<size_t>{
+        size_t(0), matingPool.size() - 1}(rng);
+    return matingPool.at(idx);
   };
   auto shouldCrossOver = [&]() -> bool {
     /*
diff --git a/tc/autotuner/genetic_search.h b/tc/autotuner/genetic_search.h
index 5c5f7f02d..16d29e2b8 100644
--- a/tc/autotuner/genetic_search.h
+++ b/tc/autotuner/genetic_search.h
@@ -79,6 +79,8 @@ class GeneticSearch {
   void updateParameters();
 
  private:
+  std::vector<TuningConfiguration> stochasticUniversalSampling(
+      const std::vector<double>& fitness) const;
   void breed();
 
   TuningConfiguration crossover(
@@ -96,6 +98,7 @@ class GeneticSearch {
   TuningConfiguration lastBestConf;
   const size_t numGenerations;
   const size_t maxPopulationSize;
+  const size_t matingPoolSize;
   const uint8_t crossOverRate;
   const uint8_t mutationRate;
   const size_t numberElites;
diff --git a/tc/benchmarks/scripts/AUTOTUNER_COMMANDS b/tc/benchmarks/scripts/AUTOTUNER_COMMANDS
index 78347c632..9c4aa582b 100644
--- a/tc/benchmarks/scripts/AUTOTUNER_COMMANDS
+++ b/tc/benchmarks/scripts/AUTOTUNER_COMMANDS
@@ -153,8 +153,8 @@ echo CUDA_LAUNCH_BLOCKING=1 ./build/tc/benchmarks/benchmark_wavenet --gtest_filt
 #
 #
 #
-echo CUDA_LAUNCH_BLOCKING=1 ./build/tc/benchmarks/benchmark_wavenet --gtest_filter="*.WaveNet2" --B=1 --RESIDUAL_C=32 --DILATION_C=32 --SKIP_C=256 --RECEPTIVE_FIELD=4000 --DILATION_FACTOR=1 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_devices="${TUNER_DEVICES}" --save_tuner_proto_prefix=${LOG_DIR}/ >> ${LOG_DIR}/wavenet2_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_1.log 2>&1 && CUDA_LAUNCH_BLOCKING=1 ./build/tc/benchmarks/benchmark_wavenet --gtest_filter="*.WaveNet2" --B=1 --RESIDUAL_C=32 --DILATION_C=32 --SKIP_C=256 --RECEPTIVE_FIELD=4000 --DILATION_FACTOR=1 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_devices="${TUNER_DEVICES}" --save_tuner_proto_prefix=${LOG_DIR}/ >> ${LOG_DIR}/wavenet2_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_1.log 2>&1
+#echo CUDA_LAUNCH_BLOCKING=1 ./build/tc/benchmarks/benchmark_wavenet --gtest_filter="*.WaveNet2" --B=1 --RESIDUAL_C=32 --DILATION_C=32 --SKIP_C=256 --RECEPTIVE_FIELD=4000 --DILATION_FACTOR=1 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_devices="${TUNER_DEVICES}" --save_tuner_proto_prefix=${LOG_DIR}/ >> ${LOG_DIR}/wavenet2_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_1.log 2>&1 && CUDA_LAUNCH_BLOCKING=1 ./build/tc/benchmarks/benchmark_wavenet --gtest_filter="*.WaveNet2" --B=1 --RESIDUAL_C=32 --DILATION_C=32 --SKIP_C=256 --RECEPTIVE_FIELD=4000 --DILATION_FACTOR=1 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_devices="${TUNER_DEVICES}" --save_tuner_proto_prefix=${LOG_DIR}/ >> ${LOG_DIR}/wavenet2_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_1.log 2>&1
 #
 #
 #
-echo CUDA_LAUNCH_BLOCKING=1 ./build/tc/benchmarks/benchmark_wavenet --gtest_filter="*.WaveNet2" --B=1 --RESIDUAL_C=32 --DILATION_C=32 --SKIP_C=256 --RECEPTIVE_FIELD=4000 --DILATION_FACTOR=32 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_devices="${TUNER_DEVICES}" --save_tuner_proto_prefix=${LOG_DIR}/ >> ${LOG_DIR}/wavenet2_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_32.log 2>&1 && CUDA_LAUNCH_BLOCKING=1 ./build/tc/benchmarks/benchmark_wavenet --gtest_filter="*.WaveNet2" --B=1 --RESIDUAL_C=32 --DILATION_C=32 --SKIP_C=256 --RECEPTIVE_FIELD=4000 --DILATION_FACTOR=32 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_devices="${TUNER_DEVICES}" --save_tuner_proto_prefix=${LOG_DIR}/ >> ${LOG_DIR}/wavenet2_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_32.log 2>&1
\ No newline at end of file
+#echo CUDA_LAUNCH_BLOCKING=1 ./build/tc/benchmarks/benchmark_wavenet --gtest_filter="*.WaveNet2" --B=1 --RESIDUAL_C=32 --DILATION_C=32 --SKIP_C=256 --RECEPTIVE_FIELD=4000 --DILATION_FACTOR=32 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_devices="${TUNER_DEVICES}" --save_tuner_proto_prefix=${LOG_DIR}/ >> ${LOG_DIR}/wavenet2_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_32.log 2>&1 && CUDA_LAUNCH_BLOCKING=1 ./build/tc/benchmarks/benchmark_wavenet --gtest_filter="*.WaveNet2" --B=1 --RESIDUAL_C=32 --DILATION_C=32 --SKIP_C=256 --RECEPTIVE_FIELD=4000 --DILATION_FACTOR=32 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_devices="${TUNER_DEVICES}" --save_tuner_proto_prefix=${LOG_DIR}/ >> ${LOG_DIR}/wavenet2_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_32.log 2>&1
\ No newline at end of file
diff --git a/tc/benchmarks/scripts/autotuner_parallel.sh b/tc/benchmarks/scripts/autotuner_parallel.sh
index 07b351a2b..0ba8e5765 100755
--- a/tc/benchmarks/scripts/autotuner_parallel.sh
+++ b/tc/benchmarks/scripts/autotuner_parallel.sh
@@ -8,7 +8,12 @@
 #SBATCH --gres=gpu:2
 #SBATCH --partition=priority,uninterrupted,learnfair,scavenge
 
-export TUNER_THREADS=${TUNER_THREADS:=20}
+module load cuda/9.0
+. ${HOME}/anaconda/bin/activate
+conda activate tc_build
+export LD_LIBRARY_PATH=${CONDA_PREFIX}/lib:${LD_LIBRARY_PATH}
+
+export TUNER_THREADS=${TUNER_THREADS:=8}
 export TUNER_DEVICES=${TUNER_DEVICES:="0,1"}
 export DEVICE_NAME=$(nvidia-smi -L | head -n 1 | cut -d'(' -f 1 | cut -d':' -f 2 | sed "s/ //g")
 
@@ -23,4 +28,5 @@ cat ${TC_PREFIX}/tc/benchmarks/scripts/AUTOTUNER_COMMANDS | grep -v "\#" | head
 cat ${TC_PREFIX}/tc/benchmarks/scripts/AUTOTUNER_COMMANDS | grep -v "\#" | head -n ${SLURM_ARRAY_TASK_ID} | tail -n 1 | xargs -i bash -c "{}"
 
 # Run with:
-# sbatch --array=1-40 -C volta ./tc/benchmarks/scripts/autotuner_parallel.sh
+# export NUM_TO_RUN=$(cat ${TC_PREFIX}/tc/benchmarks/scripts/AUTOTUNER_COMMANDS | grep -v "\#" | wc -l)
+# sbatch --array=1-${NUM_TO_RUN} -C volta ./tc/benchmarks/scripts/autotuner_parallel.sh