diff --git a/tc/autotuner/genetic_search.cc b/tc/autotuner/genetic_search.cc index b004b4916..840ac4cdb 100644 --- a/tc/autotuner/genetic_search.cc +++ b/tc/autotuner/genetic_search.cc @@ -16,6 +16,8 @@ #include "tc/autotuner/genetic_search.h" +#include +#include #include #include @@ -74,9 +76,35 @@ void mutate( } } -void normalizeVector(std::vector& v) { +double mean(std::vector& v) { + if (v.empty()) { + throw std::invalid_argument("Cannot compute the mean of an empty vector."); + } auto sum = std::accumulate(v.begin(), v.end(), 0.0); + return sum / v.size(); +} + +double stdv(std::vector& v, double mean) { + std::vector diffs(v.size()); + std::transform(v.begin(), v.end(), diffs.begin(), [mean](double val) { + return val - mean; + }); + + auto squareSum = + std::inner_product(diffs.begin(), diffs.end(), diffs.begin(), 0.0); + return std::sqrt(squareSum / v.size()); +} + +void sigmaScale(std::vector& v) { + auto m = mean(v); + auto s = stdv(v, m); + std::transform(v.begin(), v.end(), v.begin(), [m, s](double val) { + return std::max(val - (m - 2 * s), 0.0); + }); +} +void normalizeVector(std::vector& v) { + auto sum = std::accumulate(v.begin(), v.end(), 0.0); std::transform( v.begin(), v.end(), v.begin(), [sum](double v) { return v / sum; }); } @@ -92,6 +120,7 @@ std::vector computeNormalizedFitness( [](const std::unique_ptr& c) { return 1.0 / c->runtime.toMicroSeconds(); }); + sigmaScale(fitness); normalizeVector(fitness); return fitness; } @@ -166,6 +195,7 @@ GeneticSearch::GeneticSearch( lastBestConf(confs[0]), numGenerations(numGenerations), maxPopulationSize(populationSize), + matingPoolSize(populationSize * 3), crossOverRate(crossOverRate), mutationRate(mutationRate), numberElites(numberElites), @@ -225,19 +255,40 @@ TuningConfiguration GeneticSearch::crossover( return a; } +std::vector GeneticSearch::stochasticUniversalSampling( + const std::vector& fitness) const { + std::vector matingPool; + matingPool.reserve(matingPoolSize); + + auto r = std::uniform_real_distribution(0, 1.0 / matingPoolSize)(rng); + size_t count = 0; + size_t i = 0; + while (count < matingPoolSize) { + while (r <= fitness[i]) { + matingPool.push_back(population[i]->configuration); + r += 1.0 / matingPoolSize; + ++count; + } + ++i; + } + return matingPool; +} + void GeneticSearch::breed() { - auto accFitness = computeAccumulatedFitness(population); + auto matingPool = + stochasticUniversalSampling(computeAccumulatedFitness(population)); + Population new_population; - new_population.reserve(maxPopulationSize); - for (auto& p : population) { + new_population.reserve(matingPoolSize); + for (size_t c = 0; c < numberElites; ++c) { new_population.push_back( - make_unique(p->configuration)); + make_unique(population.at(c)->configuration)); } - auto select = [&]() -> const TuningConfiguration& { - auto limit = std::uniform_real_distribution{}(rng); - auto lb = std::lower_bound(accFitness.begin(), accFitness.end(), limit); - return population.at(std::distance(accFitness.begin(), lb))->configuration; + auto select = [&]() -> TuningConfiguration& { + auto idx = std::uniform_int_distribution{ + size_t(0), matingPool.size() - 1}(rng); + return matingPool.at(idx); }; auto shouldCrossOver = [&]() -> bool { /* diff --git a/tc/autotuner/genetic_search.h b/tc/autotuner/genetic_search.h index 5c5f7f02d..16d29e2b8 100644 --- a/tc/autotuner/genetic_search.h +++ b/tc/autotuner/genetic_search.h @@ -79,6 +79,8 @@ class GeneticSearch { void updateParameters(); private: + std::vector stochasticUniversalSampling( + const std::vector& fitness) const; void breed(); TuningConfiguration crossover( @@ -96,6 +98,7 @@ class GeneticSearch { TuningConfiguration lastBestConf; const size_t numGenerations; const size_t maxPopulationSize; + const size_t matingPoolSize; const uint8_t crossOverRate; const uint8_t mutationRate; const size_t numberElites; diff --git a/tc/benchmarks/scripts/AUTOTUNER_COMMANDS b/tc/benchmarks/scripts/AUTOTUNER_COMMANDS index 78347c632..9c4aa582b 100644 --- a/tc/benchmarks/scripts/AUTOTUNER_COMMANDS +++ b/tc/benchmarks/scripts/AUTOTUNER_COMMANDS @@ -153,8 +153,8 @@ echo CUDA_LAUNCH_BLOCKING=1 ./build/tc/benchmarks/benchmark_wavenet --gtest_filt # # # -echo CUDA_LAUNCH_BLOCKING=1 ./build/tc/benchmarks/benchmark_wavenet --gtest_filter="*.WaveNet2" --B=1 --RESIDUAL_C=32 --DILATION_C=32 --SKIP_C=256 --RECEPTIVE_FIELD=4000 --DILATION_FACTOR=1 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_devices="${TUNER_DEVICES}" --save_tuner_proto_prefix=${LOG_DIR}/ >> ${LOG_DIR}/wavenet2_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_1.log 2>&1 && CUDA_LAUNCH_BLOCKING=1 ./build/tc/benchmarks/benchmark_wavenet --gtest_filter="*.WaveNet2" --B=1 --RESIDUAL_C=32 --DILATION_C=32 --SKIP_C=256 --RECEPTIVE_FIELD=4000 --DILATION_FACTOR=1 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_devices="${TUNER_DEVICES}" --save_tuner_proto_prefix=${LOG_DIR}/ >> ${LOG_DIR}/wavenet2_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_1.log 2>&1 +#echo CUDA_LAUNCH_BLOCKING=1 ./build/tc/benchmarks/benchmark_wavenet --gtest_filter="*.WaveNet2" --B=1 --RESIDUAL_C=32 --DILATION_C=32 --SKIP_C=256 --RECEPTIVE_FIELD=4000 --DILATION_FACTOR=1 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_devices="${TUNER_DEVICES}" --save_tuner_proto_prefix=${LOG_DIR}/ >> ${LOG_DIR}/wavenet2_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_1.log 2>&1 && CUDA_LAUNCH_BLOCKING=1 ./build/tc/benchmarks/benchmark_wavenet --gtest_filter="*.WaveNet2" --B=1 --RESIDUAL_C=32 --DILATION_C=32 --SKIP_C=256 --RECEPTIVE_FIELD=4000 --DILATION_FACTOR=1 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_devices="${TUNER_DEVICES}" --save_tuner_proto_prefix=${LOG_DIR}/ >> ${LOG_DIR}/wavenet2_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_1.log 2>&1 # # # -echo CUDA_LAUNCH_BLOCKING=1 ./build/tc/benchmarks/benchmark_wavenet --gtest_filter="*.WaveNet2" --B=1 --RESIDUAL_C=32 --DILATION_C=32 --SKIP_C=256 --RECEPTIVE_FIELD=4000 --DILATION_FACTOR=32 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_devices="${TUNER_DEVICES}" --save_tuner_proto_prefix=${LOG_DIR}/ >> ${LOG_DIR}/wavenet2_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_32.log 2>&1 && CUDA_LAUNCH_BLOCKING=1 ./build/tc/benchmarks/benchmark_wavenet --gtest_filter="*.WaveNet2" --B=1 --RESIDUAL_C=32 --DILATION_C=32 --SKIP_C=256 --RECEPTIVE_FIELD=4000 --DILATION_FACTOR=32 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_devices="${TUNER_DEVICES}" --save_tuner_proto_prefix=${LOG_DIR}/ >> ${LOG_DIR}/wavenet2_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_32.log 2>&1 \ No newline at end of file +#echo CUDA_LAUNCH_BLOCKING=1 ./build/tc/benchmarks/benchmark_wavenet --gtest_filter="*.WaveNet2" --B=1 --RESIDUAL_C=32 --DILATION_C=32 --SKIP_C=256 --RECEPTIVE_FIELD=4000 --DILATION_FACTOR=32 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_devices="${TUNER_DEVICES}" --save_tuner_proto_prefix=${LOG_DIR}/ >> ${LOG_DIR}/wavenet2_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_32.log 2>&1 && CUDA_LAUNCH_BLOCKING=1 ./build/tc/benchmarks/benchmark_wavenet --gtest_filter="*.WaveNet2" --B=1 --RESIDUAL_C=32 --DILATION_C=32 --SKIP_C=256 --RECEPTIVE_FIELD=4000 --DILATION_FACTOR=32 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_devices="${TUNER_DEVICES}" --save_tuner_proto_prefix=${LOG_DIR}/ >> ${LOG_DIR}/wavenet2_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_32.log 2>&1 \ No newline at end of file diff --git a/tc/benchmarks/scripts/autotuner_parallel.sh b/tc/benchmarks/scripts/autotuner_parallel.sh index 07b351a2b..0ba8e5765 100755 --- a/tc/benchmarks/scripts/autotuner_parallel.sh +++ b/tc/benchmarks/scripts/autotuner_parallel.sh @@ -8,7 +8,12 @@ #SBATCH --gres=gpu:2 #SBATCH --partition=priority,uninterrupted,learnfair,scavenge -export TUNER_THREADS=${TUNER_THREADS:=20} +module load cuda/9.0 +. ${HOME}/anaconda/bin/activate +conda activate tc_build +export LD_LIBRARY_PATH=${CONDA_PREFIX}/lib:${LD_LIBRARY_PATH} + +export TUNER_THREADS=${TUNER_THREADS:=8} export TUNER_DEVICES=${TUNER_DEVICES:="0,1"} export DEVICE_NAME=$(nvidia-smi -L | head -n 1 | cut -d'(' -f 1 | cut -d':' -f 2 | sed "s/ //g") @@ -23,4 +28,5 @@ cat ${TC_PREFIX}/tc/benchmarks/scripts/AUTOTUNER_COMMANDS | grep -v "\#" | head cat ${TC_PREFIX}/tc/benchmarks/scripts/AUTOTUNER_COMMANDS | grep -v "\#" | head -n ${SLURM_ARRAY_TASK_ID} | tail -n 1 | xargs -i bash -c "{}" # Run with: -# sbatch --array=1-40 -C volta ./tc/benchmarks/scripts/autotuner_parallel.sh +# export NUM_TO_RUN=$(cat ${TC_PREFIX}/tc/benchmarks/scripts/AUTOTUNER_COMMANDS | grep -v "\#" | wc -l) +# sbatch --array=1-${NUM_TO_RUN} -C volta ./tc/benchmarks/scripts/autotuner_parallel.sh