Skip to content
This repository was archived by the owner on Apr 28, 2023. It is now read-only.

Autotuning improvements #501

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 60 additions & 9 deletions tc/autotuner/genetic_search.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@

#include "tc/autotuner/genetic_search.h"

#include <algorithm>
#include <numeric>
#include <random>
#include <sstream>

Expand Down Expand Up @@ -74,9 +76,35 @@ void mutate(
}
}

void normalizeVector(std::vector<double>& v) {
// Returns the arithmetic mean of `v`.
// Throws std::invalid_argument when `v` is empty (the mean is undefined).
double mean(const std::vector<double>& v) {
  if (v.empty()) {
    throw std::invalid_argument("Cannot compute the mean of an empty vector.");
  }
  auto sum = std::accumulate(v.begin(), v.end(), 0.0);
  return sum / v.size();
}

// Returns the population standard deviation of `v` (divides by N, not N-1),
// given its precomputed mean.
// Throws std::invalid_argument when `v` is empty, consistent with mean().
double stdv(const std::vector<double>& v, double mean) {
  if (v.empty()) {
    throw std::invalid_argument(
        "Cannot compute the standard deviation of an empty vector.");
  }
  // Accumulate the sum of squared deviations directly instead of
  // materializing an intermediate vector of differences.
  auto squareSum = std::accumulate(
      v.begin(), v.end(), 0.0, [mean](double acc, double val) {
        auto diff = val - mean;
        return acc + diff * diff;
      });
  return std::sqrt(squareSum / v.size());
}

// Sigma-scales `v` in place: every entry x becomes
// max(x - (mean - 2 * stdev), 0), i.e. values are shifted relative to the
// spread of the population and clamped at zero.
void sigmaScale(std::vector<double>& v) {
  const auto avg = mean(v);
  const auto spread = stdv(v, avg);
  const auto shift = avg - 2 * spread;
  for (auto& val : v) {
    val = std::max(val - shift, 0.0);
  }
}

// Rescales `v` in place so its entries sum to 1.0 (a probability
// distribution). If every entry is zero -- which sigmaScale produces whenever
// all fitness values are identical (stdev 0 clamps everything to 0) -- fall
// back to a uniform distribution instead of dividing by zero and filling the
// vector with NaNs.
void normalizeVector(std::vector<double>& v) {
  if (v.empty()) {
    return;
  }
  auto sum = std::accumulate(v.begin(), v.end(), 0.0);
  if (sum == 0.0) {
    std::fill(v.begin(), v.end(), 1.0 / v.size());
    return;
  }
  std::transform(
      v.begin(), v.end(), v.begin(), [sum](double val) { return val / sum; });
}
Expand All @@ -92,6 +120,7 @@ std::vector<double> computeNormalizedFitness(
[](const std::unique_ptr<CandidateConfiguration>& c) {
return 1.0 / c->runtime.toMicroSeconds();
});
sigmaScale(fitness);
normalizeVector(fitness);
return fitness;
}
Expand Down Expand Up @@ -166,6 +195,7 @@ GeneticSearch::GeneticSearch(
lastBestConf(confs[0]),
numGenerations(numGenerations),
maxPopulationSize(populationSize),
matingPoolSize(populationSize * 3),

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In one of the subsequent commits I introduced a separate flag for the mating pool size (instead of hardcoding populationSize * 3).

crossOverRate(crossOverRate),
mutationRate(mutationRate),
numberElites(numberElites),
Expand Down Expand Up @@ -225,19 +255,40 @@ TuningConfiguration GeneticSearch::crossover(
return a;
}

// Builds a mating pool of matingPoolSize configurations via stochastic
// universal sampling (SUS): one random offset in [0, 1/matingPoolSize)
// followed by equally spaced pointers swept over the cumulative fitness
// distribution. `fitness` must hold the accumulated (cumulative, normalized)
// fitness of `population`, in the same order (see the call site in breed()).
std::vector<TuningConfiguration> GeneticSearch::stochasticUniversalSampling(
    const std::vector<double>& fitness) const {
  std::vector<TuningConfiguration> matingPool;
  matingPool.reserve(matingPoolSize);

  const auto step = 1.0 / matingPoolSize;
  auto r = std::uniform_real_distribution<double>(0, step)(rng);
  size_t count = 0;
  size_t i = 0;
  // Bound i explicitly: floating-point rounding can leave the final
  // accumulated fitness slightly below 1.0 while r creeps above it, which
  // would otherwise read past the end of `fitness`.
  while (count < matingPoolSize && i < fitness.size()) {
    while (count < matingPoolSize && r <= fitness[i]) {
      matingPool.push_back(population[i]->configuration);
      r += step;
      ++count;
    }
    ++i;
  }
  // Pad any rounding shortfall with the last candidate so the pool always
  // contains exactly matingPoolSize entries.
  while (count < matingPoolSize && !population.empty()) {
    matingPool.push_back(population.back()->configuration);
    ++count;
  }
  return matingPool;
}

void GeneticSearch::breed() {
auto accFitness = computeAccumulatedFitness(population);
auto matingPool =
stochasticUniversalSampling(computeAccumulatedFitness(population));

Population new_population;
new_population.reserve(maxPopulationSize);
for (auto& p : population) {
new_population.reserve(matingPoolSize);
for (size_t c = 0; c < numberElites; ++c) {
new_population.push_back(
make_unique<CandidateConfiguration>(p->configuration));
make_unique<CandidateConfiguration>(population.at(c)->configuration));
}

auto select = [&]() -> const TuningConfiguration& {
auto limit = std::uniform_real_distribution<double>{}(rng);
auto lb = std::lower_bound(accFitness.begin(), accFitness.end(), limit);
return population.at(std::distance(accFitness.begin(), lb))->configuration;
auto select = [&]() -> TuningConfiguration& {
auto idx = std::uniform_int_distribution<size_t>{
size_t(0), matingPool.size() - 1}(rng);
return matingPool.at(idx);
};
auto shouldCrossOver = [&]() -> bool {
/*
Expand Down
3 changes: 3 additions & 0 deletions tc/autotuner/genetic_search.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,8 @@ class GeneticSearch {
void updateParameters();

private:
std::vector<TuningConfiguration> stochasticUniversalSampling(
const std::vector<double>& fitness) const;
void breed();

TuningConfiguration crossover(
Expand All @@ -96,6 +98,7 @@ class GeneticSearch {
TuningConfiguration lastBestConf;
const size_t numGenerations;
const size_t maxPopulationSize;
const size_t matingPoolSize;
const uint8_t crossOverRate;
const uint8_t mutationRate;
const size_t numberElites;
Expand Down
4 changes: 2 additions & 2 deletions tc/benchmarks/scripts/AUTOTUNER_COMMANDS
Original file line number Diff line number Diff line change
Expand Up @@ -153,8 +153,8 @@ echo CUDA_LAUNCH_BLOCKING=1 ./build/tc/benchmarks/benchmark_wavenet --gtest_filt
#
#
#
echo CUDA_LAUNCH_BLOCKING=1 ./build/tc/benchmarks/benchmark_wavenet --gtest_filter="*.WaveNet2" --B=1 --RESIDUAL_C=32 --DILATION_C=32 --SKIP_C=256 --RECEPTIVE_FIELD=4000 --DILATION_FACTOR=1 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_devices="${TUNER_DEVICES}" --save_tuner_proto_prefix=${LOG_DIR}/ >> ${LOG_DIR}/wavenet2_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_1.log 2>&1 && CUDA_LAUNCH_BLOCKING=1 ./build/tc/benchmarks/benchmark_wavenet --gtest_filter="*.WaveNet2" --B=1 --RESIDUAL_C=32 --DILATION_C=32 --SKIP_C=256 --RECEPTIVE_FIELD=4000 --DILATION_FACTOR=1 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_devices="${TUNER_DEVICES}" --save_tuner_proto_prefix=${LOG_DIR}/ >> ${LOG_DIR}/wavenet2_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_1.log 2>&1
#echo CUDA_LAUNCH_BLOCKING=1 ./build/tc/benchmarks/benchmark_wavenet --gtest_filter="*.WaveNet2" --B=1 --RESIDUAL_C=32 --DILATION_C=32 --SKIP_C=256 --RECEPTIVE_FIELD=4000 --DILATION_FACTOR=1 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_devices="${TUNER_DEVICES}" --save_tuner_proto_prefix=${LOG_DIR}/ >> ${LOG_DIR}/wavenet2_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_1.log 2>&1 && CUDA_LAUNCH_BLOCKING=1 ./build/tc/benchmarks/benchmark_wavenet --gtest_filter="*.WaveNet2" --B=1 --RESIDUAL_C=32 --DILATION_C=32 --SKIP_C=256 --RECEPTIVE_FIELD=4000 --DILATION_FACTOR=1 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_devices="${TUNER_DEVICES}" --save_tuner_proto_prefix=${LOG_DIR}/ >> ${LOG_DIR}/wavenet2_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_1.log 2>&1
#
#
#
echo CUDA_LAUNCH_BLOCKING=1 ./build/tc/benchmarks/benchmark_wavenet --gtest_filter="*.WaveNet2" --B=1 --RESIDUAL_C=32 --DILATION_C=32 --SKIP_C=256 --RECEPTIVE_FIELD=4000 --DILATION_FACTOR=32 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_devices="${TUNER_DEVICES}" --save_tuner_proto_prefix=${LOG_DIR}/ >> ${LOG_DIR}/wavenet2_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_32.log 2>&1 && CUDA_LAUNCH_BLOCKING=1 ./build/tc/benchmarks/benchmark_wavenet --gtest_filter="*.WaveNet2" --B=1 --RESIDUAL_C=32 --DILATION_C=32 --SKIP_C=256 --RECEPTIVE_FIELD=4000 --DILATION_FACTOR=32 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_devices="${TUNER_DEVICES}" --save_tuner_proto_prefix=${LOG_DIR}/ >> ${LOG_DIR}/wavenet2_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_32.log 2>&1
#echo CUDA_LAUNCH_BLOCKING=1 ./build/tc/benchmarks/benchmark_wavenet --gtest_filter="*.WaveNet2" --B=1 --RESIDUAL_C=32 --DILATION_C=32 --SKIP_C=256 --RECEPTIVE_FIELD=4000 --DILATION_FACTOR=32 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_devices="${TUNER_DEVICES}" --save_tuner_proto_prefix=${LOG_DIR}/ >> ${LOG_DIR}/wavenet2_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_32.log 2>&1 && CUDA_LAUNCH_BLOCKING=1 ./build/tc/benchmarks/benchmark_wavenet --gtest_filter="*.WaveNet2" --B=1 --RESIDUAL_C=32 --DILATION_C=32 --SKIP_C=256 --RECEPTIVE_FIELD=4000 --DILATION_FACTOR=32 --debug_tuner=true --dump_cuda=true --disable_version_checks=true --log_dir=${LOG_DIR} --autotune=true --tuner_gen_log_generations=true --tuner_threads=${TUNER_THREADS} --tuner_devices="${TUNER_DEVICES}" --save_tuner_proto_prefix=${LOG_DIR}/ >> ${LOG_DIR}/wavenet2_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_32.log 2>&1
10 changes: 8 additions & 2 deletions tc/benchmarks/scripts/autotuner_parallel.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,12 @@
#SBATCH --gres=gpu:2
#SBATCH --partition=priority,uninterrupted,learnfair,scavenge

export TUNER_THREADS=${TUNER_THREADS:=20}
module load cuda/9.0
. ${HOME}/anaconda/bin/activate
conda activate tc_build
export LD_LIBRARY_PATH=${CONDA_PREFIX}/lib:${LD_LIBRARY_PATH}

export TUNER_THREADS=${TUNER_THREADS:=8}
export TUNER_DEVICES=${TUNER_DEVICES:="0,1"}
export DEVICE_NAME=$(nvidia-smi -L | head -n 1 | cut -d'(' -f 1 | cut -d':' -f 2 | sed "s/ //g")

Expand All @@ -23,4 +28,5 @@ cat ${TC_PREFIX}/tc/benchmarks/scripts/AUTOTUNER_COMMANDS | grep -v "\#" | head
cat ${TC_PREFIX}/tc/benchmarks/scripts/AUTOTUNER_COMMANDS | grep -v "\#" | head -n ${SLURM_ARRAY_TASK_ID} | tail -n 1 | xargs -i bash -c "{}"

# Run with:
# sbatch --array=1-40 -C volta ./tc/benchmarks/scripts/autotuner_parallel.sh
# export NUM_TO_RUN=$(cat ${TC_PREFIX}/tc/benchmarks/scripts/AUTOTUNER_COMMANDS | grep -v "\#" | wc -l)
# sbatch --array=1-${NUM_TO_RUN} -C volta ./tc/benchmarks/scripts/autotuner_parallel.sh