From 446b19b4ea85e1a15cf8443c86bcb28803277e70 Mon Sep 17 00:00:00 2001 From: Theodoros Theodoridis Date: Tue, 13 Mar 2018 11:12:37 +0100 Subject: [PATCH 1/5] [genetic search] Scale fitness before selection Using an unscaled fitness value for selection is problematic: -Outstanding individuals take over very quickly, which leads to premature convergence. -When fitness values are close together, very little selection pressure is applied and selection is almost uniformly random. Having slightly better fitness does not improve an individual's survival chances. -Transposing the fitness function (e.g. adding a constant value) changes the selection probabilities even though the location of the optimum (and the "shape" of the fitness) remains unchanged. Scaling the fitness function helps ameliorate those issues. Sigma scaling is used: fitness' = max(fitness - (mean_fitness - 2 * std_fitness), 0) --- tc/autotuner/genetic_search.cc | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/tc/autotuner/genetic_search.cc b/tc/autotuner/genetic_search.cc index b004b4916..742da64e3 100644 --- a/tc/autotuner/genetic_search.cc +++ b/tc/autotuner/genetic_search.cc @@ -16,6 +16,8 @@ #include "tc/autotuner/genetic_search.h" +#include +#include #include #include @@ -74,9 +76,35 @@ void mutate( } } -void normalizeVector(std::vector& v) { +double mean(std::vector& v) { + if (v.empty()) { + throw std::invalid_argument("Cannot compute the mean of an empty vector."); + } auto sum = std::accumulate(v.begin(), v.end(), 0.0); + return sum / v.size(); +} + +double stdv(std::vector& v, double mean) { + std::vector diffs(v.size()); + std::transform(v.begin(), v.end(), diffs.begin(), [mean](double val) { + return val - mean; + }); + + auto squareSum = + std::inner_product(diffs.begin(), diffs.end(), diffs.begin(), 0.0); + return std::sqrt(squareSum / v.size()); +} + +void sigmaScale(std::vector& v) { + auto m = mean(v); + auto s = stdv(v, m); + 
std::transform(v.begin(), v.end(), v.begin(), [m, s](double val) { + return std::max(val - (m - 2 * s), 0.0); + }); +} +void normalizeVector(std::vector& v) { + auto sum = std::accumulate(v.begin(), v.end(), 0.0); std::transform( v.begin(), v.end(), v.begin(), [sum](double v) { return v / sum; }); } @@ -92,6 +120,7 @@ std::vector computeNormalizedFitness( [](const std::unique_ptr& c) { return 1.0 / c->runtime.toMicroSeconds(); }); + sigmaScale(fitness); normalizeVector(fitness); return fitness; } From d23898e5aa698a49b9adf68b781090a819862fce Mon Sep 17 00:00:00 2001 From: Theodoros Theodoridis Date: Thu, 15 Mar 2018 10:14:40 +0100 Subject: [PATCH 2/5] [genetic search] Fix bug that disabled crossover --- tc/autotuner/genetic_search.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tc/autotuner/genetic_search.cc b/tc/autotuner/genetic_search.cc index 742da64e3..924597dd6 100644 --- a/tc/autotuner/genetic_search.cc +++ b/tc/autotuner/genetic_search.cc @@ -258,9 +258,9 @@ void GeneticSearch::breed() { auto accFitness = computeAccumulatedFitness(population); Population new_population; new_population.reserve(maxPopulationSize); - for (auto& p : population) { + for (size_t c = 0; c < numberElites; ++c) { new_population.push_back( - make_unique(p->configuration)); + make_unique(population.at(c)->configuration)); } auto select = [&]() -> const TuningConfiguration& { From cfad85e8ba0656430d3a5fa27ff3e23b45b29010 Mon Sep 17 00:00:00 2001 From: Theodoros Theodoridis Date: Thu, 15 Mar 2018 10:15:34 +0100 Subject: [PATCH 3/5] [genetic search] Switch to Stochastic Universal Sampling Stochastic Universal Sampling is an improvement upon the roulette algorithm that was previously used --- tc/autotuner/genetic_search.cc | 34 ++++++++++++++++++++++++++++------ tc/autotuner/genetic_search.h | 3 +++ 2 files changed, 31 insertions(+), 6 deletions(-) diff --git a/tc/autotuner/genetic_search.cc b/tc/autotuner/genetic_search.cc index 924597dd6..840ac4cdb 100644 --- 
a/tc/autotuner/genetic_search.cc +++ b/tc/autotuner/genetic_search.cc @@ -195,6 +195,7 @@ GeneticSearch::GeneticSearch( lastBestConf(confs[0]), numGenerations(numGenerations), maxPopulationSize(populationSize), + matingPoolSize(populationSize * 3), crossOverRate(crossOverRate), mutationRate(mutationRate), numberElites(numberElites), @@ -254,19 +255,40 @@ TuningConfiguration GeneticSearch::crossover( return a; } +std::vector GeneticSearch::stochasticUniversalSampling( + const std::vector& fitness) const { + std::vector matingPool; + matingPool.reserve(matingPoolSize); + + auto r = std::uniform_real_distribution(0, 1.0 / matingPoolSize)(rng); + size_t count = 0; + size_t i = 0; + while (count < matingPoolSize) { + while (r <= fitness[i]) { + matingPool.push_back(population[i]->configuration); + r += 1.0 / matingPoolSize; + ++count; + } + ++i; + } + return matingPool; +} + void GeneticSearch::breed() { - auto accFitness = computeAccumulatedFitness(population); + auto matingPool = + stochasticUniversalSampling(computeAccumulatedFitness(population)); + Population new_population; - new_population.reserve(maxPopulationSize); + new_population.reserve(matingPoolSize); for (size_t c = 0; c < numberElites; ++c) { new_population.push_back( make_unique(population.at(c)->configuration)); } - auto select = [&]() -> const TuningConfiguration& { - auto limit = std::uniform_real_distribution{}(rng); - auto lb = std::lower_bound(accFitness.begin(), accFitness.end(), limit); - return population.at(std::distance(accFitness.begin(), lb))->configuration; + auto select = [&]() -> TuningConfiguration& { + auto idx = std::uniform_int_distribution{ + size_t(0), matingPool.size() - 1}(rng); + return matingPool.at(idx); }; auto shouldCrossOver = [&]() -> bool { /* diff --git a/tc/autotuner/genetic_search.h b/tc/autotuner/genetic_search.h index 5c5f7f02d..16d29e2b8 100644 --- a/tc/autotuner/genetic_search.h +++ b/tc/autotuner/genetic_search.h @@ -79,6 +79,8 @@ class GeneticSearch { void 
updateParameters(); private: + std::vector stochasticUniversalSampling( + const std::vector& fitness) const; void breed(); TuningConfiguration crossover( @@ -96,6 +98,7 @@ class GeneticSearch { TuningConfiguration lastBestConf; const size_t numGenerations; const size_t maxPopulationSize; + const size_t matingPoolSize; const uint8_t crossOverRate; const uint8_t mutationRate; const size_t numberElites; From 43ac05439aebad5b0290cb7be9f97479e2bf76af Mon Sep 17 00:00:00 2001 From: nicolasvasilache Date: Sun, 10 Jun 2018 20:19:56 -0600 Subject: [PATCH 4/5] Update tuning command on FAIR cluster This is needed following the changes to the build system. --- tc/benchmarks/scripts/autotuner_parallel.sh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tc/benchmarks/scripts/autotuner_parallel.sh b/tc/benchmarks/scripts/autotuner_parallel.sh index 07b351a2b..a76d374ba 100755 --- a/tc/benchmarks/scripts/autotuner_parallel.sh +++ b/tc/benchmarks/scripts/autotuner_parallel.sh @@ -8,7 +8,12 @@ #SBATCH --gres=gpu:2 #SBATCH --partition=priority,uninterrupted,learnfair,scavenge -export TUNER_THREADS=${TUNER_THREADS:=20} +module load cuda/9.0 +. 
${HOME}/anaconda/bin/activate +conda activate tc_build +export LD_LIBRARY_PATH=${CONDA_PREFIX}/lib:${LD_LIBRARY_PATH} + +export TUNER_THREADS=${TUNER_THREADS:=8} export TUNER_DEVICES=${TUNER_DEVICES:="0,1"} export DEVICE_NAME=$(nvidia-smi -L | head -n 1 | cut -d'(' -f 1 | cut -d':' -f 2 | sed "s/ //g") From 380c0542297081a7d3a250c03ec10e21bfb7688a Mon Sep 17 00:00:00 2001 From: nicolasvasilache Date: Sun, 10 Jun 2018 21:59:21 -0600 Subject: [PATCH 5/5] Drop wavenet2 for now It is too slow to converge and does not yield perf gains --- tc/benchmarks/scripts/AUTOTUNER_COMMANDS | 4 ++-- tc/benchmarks/scripts/autotuner_parallel.sh | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/tc/benchmarks/scripts/AUTOTUNER_COMMANDS b/tc/benchmarks/scripts/AUTOTUNER_COMMANDS index 78347c632..9c4aa582b 100644 --- a/tc/benchmarks/scripts/AUTOTUNER_COMMANDS +++ b/tc/benchmarks/scripts/AUTOTUNER_COMMANDS @@ -153,8 +153,8 @@ echo CUDA_LAUNCH_BLOCKING=1 ./build/tc/benchmarks/benchmark_wavenet --gtest_filt # # # -echo CUDA_LAUNCH_BLOCKING=1 ./build/tc/benchmarks/benchmark_wavenet --gtest_filter="*.WaveNet2" --B=1 --RESIDUAL_C=32 --DILATION_C=32 --SKIP_C=256 --RECEPTIVE_FIELD=4000 --DILATION_FACTOR=1 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_devices="${TUNER_DEVICES}" --save_tuner_proto_prefix=${LOG_DIR}/ >> ${LOG_DIR}/wavenet2_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_1.log 2>&1 && CUDA_LAUNCH_BLOCKING=1 ./build/tc/benchmarks/benchmark_wavenet --gtest_filter="*.WaveNet2" --B=1 --RESIDUAL_C=32 --DILATION_C=32 --SKIP_C=256 --RECEPTIVE_FIELD=4000 --DILATION_FACTOR=1 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_devices="${TUNER_DEVICES}" --save_tuner_proto_prefix=${LOG_DIR}/ >> 
${LOG_DIR}/wavenet2_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_1.log 2>&1 +#echo CUDA_LAUNCH_BLOCKING=1 ./build/tc/benchmarks/benchmark_wavenet --gtest_filter="*.WaveNet2" --B=1 --RESIDUAL_C=32 --DILATION_C=32 --SKIP_C=256 --RECEPTIVE_FIELD=4000 --DILATION_FACTOR=1 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_devices="${TUNER_DEVICES}" --save_tuner_proto_prefix=${LOG_DIR}/ >> ${LOG_DIR}/wavenet2_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_1.log 2>&1 && CUDA_LAUNCH_BLOCKING=1 ./build/tc/benchmarks/benchmark_wavenet --gtest_filter="*.WaveNet2" --B=1 --RESIDUAL_C=32 --DILATION_C=32 --SKIP_C=256 --RECEPTIVE_FIELD=4000 --DILATION_FACTOR=1 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_devices="${TUNER_DEVICES}" --save_tuner_proto_prefix=${LOG_DIR}/ >> ${LOG_DIR}/wavenet2_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_1.log 2>&1 # # # -echo CUDA_LAUNCH_BLOCKING=1 ./build/tc/benchmarks/benchmark_wavenet --gtest_filter="*.WaveNet2" --B=1 --RESIDUAL_C=32 --DILATION_C=32 --SKIP_C=256 --RECEPTIVE_FIELD=4000 --DILATION_FACTOR=32 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_devices="${TUNER_DEVICES}" --save_tuner_proto_prefix=${LOG_DIR}/ >> ${LOG_DIR}/wavenet2_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_32.log 2>&1 && CUDA_LAUNCH_BLOCKING=1 ./build/tc/benchmarks/benchmark_wavenet --gtest_filter="*.WaveNet2" --B=1 --RESIDUAL_C=32 --DILATION_C=32 --SKIP_C=256 --RECEPTIVE_FIELD=4000 --DILATION_FACTOR=32 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_devices="${TUNER_DEVICES}" 
--save_tuner_proto_prefix=${LOG_DIR}/ >> ${LOG_DIR}/wavenet2_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_32.log 2>&1 \ No newline at end of file +#echo CUDA_LAUNCH_BLOCKING=1 ./build/tc/benchmarks/benchmark_wavenet --gtest_filter="*.WaveNet2" --B=1 --RESIDUAL_C=32 --DILATION_C=32 --SKIP_C=256 --RECEPTIVE_FIELD=4000 --DILATION_FACTOR=32 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_devices="${TUNER_DEVICES}" --save_tuner_proto_prefix=${LOG_DIR}/ >> ${LOG_DIR}/wavenet2_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_32.log 2>&1 && CUDA_LAUNCH_BLOCKING=1 ./build/tc/benchmarks/benchmark_wavenet --gtest_filter="*.WaveNet2" --B=1 --RESIDUAL_C=32 --DILATION_C=32 --SKIP_C=256 --RECEPTIVE_FIELD=4000 --DILATION_FACTOR=32 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_devices="${TUNER_DEVICES}" --save_tuner_proto_prefix=${LOG_DIR}/ >> ${LOG_DIR}/wavenet2_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_32.log 2>&1 \ No newline at end of file diff --git a/tc/benchmarks/scripts/autotuner_parallel.sh b/tc/benchmarks/scripts/autotuner_parallel.sh index a76d374ba..0ba8e5765 100755 --- a/tc/benchmarks/scripts/autotuner_parallel.sh +++ b/tc/benchmarks/scripts/autotuner_parallel.sh @@ -28,4 +28,5 @@ cat ${TC_PREFIX}/tc/benchmarks/scripts/AUTOTUNER_COMMANDS | grep -v "\#" | head cat ${TC_PREFIX}/tc/benchmarks/scripts/AUTOTUNER_COMMANDS | grep -v "\#" | head -n ${SLURM_ARRAY_TASK_ID} | tail -n 1 | xargs -i bash -c "{}" # Run with: -# sbatch --array=1-40 -C volta ./tc/benchmarks/scripts/autotuner_parallel.sh +# export NUM_TO_RUN=$(cat ${TC_PREFIX}/tc/benchmarks/scripts/AUTOTUNER_COMMANDS | grep -v "\#" | wc -l) +# sbatch --array=1-${NUM_TO_RUN} -C volta ./tc/benchmarks/scripts/autotuner_parallel.sh