This repository was archived by the owner on Apr 28, 2023. It is now read-only.

Genetic Tuning Improvements #160

Closed
wants to merge 9 commits
@@ -58,7 +58,6 @@ You can read about all the parameters here - :ref:`autotuner_parameters`.
- :code:`threads` - set this to the number of CPU cores available.
- :code:`generations` - 5 to 10 generations is a good number.
- :code:`pop_size` - 10 is usually reasonable. You can try 10 to 20.
- :code:`number_elites` - number of candidates preserved intact between generations. `1` is usually sufficient.
- :code:`min_launch_total_threads` - If you have really small input sizes, set this to `1`.
- :code:`gpus` - Number of GPUs to use for autotuning. The default value is "0". Set this to "0,1" if you wish to use two GPUs (for example).

@@ -70,15 +69,15 @@ kernel timing. You can adopt the following parameter settings as starters for au
.. code::

settings = {
"threads": 32, "generations": 2, "pop_size": 10, "number_elites": 1
"threads": 32, "generations": 2, "pop_size": 10
}

* The good defaults that run for a bit longer (in exchange for better performance):

.. code::

settings = {
"threads": 32, "generations": 5, "pop_size": 10, "number_elites": 1
"threads": 32, "generations": 5, "pop_size": 10
}


@@ -87,7 +86,7 @@ kernel timing. You can adopt the following parameter settings as starters for au
.. code::

settings = {
"threads": 32, "generations": 25, "pop_size": 100, "number_elites": 10
"threads": 32, "generations": 25, "pop_size": 100
}


2 changes: 1 addition & 1 deletion docs/source/tutorials/tutorial_tensordot_with_tc.rst
@@ -132,7 +132,7 @@ later.
You can control the amount of autotuning by changing the autotuner parameters. See
:ref:`autotune_parameters` for how to change the settings.
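
For concreteness, a tuning call with a custom ``settings`` dict might look like the sketch below. The ``tc.define``/``autotune`` entry points and the tensordot definition are paraphrased from the surrounding documentation rather than taken from this diff, so treat the exact signatures and shapes as assumptions and defer to the full tutorial listing.

.. code::

    import torch
    import tensor_comprehensions as tc

    # Sketch only: entry points, argument names, and shapes below are
    # assumptions paraphrased from the TC PyTorch docs, not part of this change.
    lang = """
    def tensordot(float(N, C1, C2, H, W) I0, float(N, C2, C3, H, W) I1) -> (O) {
        O(n, c1, c3, h, w) +=! I0(n, c1, c2, h, w) * I1(n, c2, c3, h, w)
    }
    """
    tensordot = tc.define(lang, name="tensordot")

    I0 = torch.randn(32, 8, 16, 48, 48).cuda()
    I1 = torch.randn(32, 16, 2, 48, 48).cuda()

    settings = {"threads": 32, "generations": 25, "pop_size": 100}
    best_options = tensordot.autotune(I0, I1, **settings)
    out = tensordot(I0, I1, options=best_options)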

For the setting ``settings={"generations": 25, "pop_size": 100, "number_elites": 10}``, we
For the setting ``settings={"generations": 25, "pop_size": 100}``, we
get decent kernel performance, as shown in the screenshot below (tuned on one M40 GPU):

.. figure:: ../_static/img/autotuning-py.jpg
3 changes: 2 additions & 1 deletion tc/autotuner/genetic_autotuner.cc
@@ -120,7 +120,8 @@ llvm::Optional<CudaMappingOptions> GeneticAutotuner::tune(
FLAGS_tuner_gen_pop_size,
FLAGS_tuner_gen_crossover_rate,
FLAGS_tuner_gen_mutation_rate,
FLAGS_tuner_gen_number_elites,
FLAGS_tuner_gen_mating_pool_size,
FLAGS_tuner_gen_selection_pool_size,
tcNameMap_.at(tcName),
tcName,
inputs,
170 changes: 116 additions & 54 deletions tc/autotuner/genetic_search.cc
@@ -16,9 +16,13 @@

#include "tc/autotuner/genetic_search.h"

#include <algorithm>
#include <numeric>
#include <random>
#include <sstream>

#include "tc/autotuner/utils/utils.h"

namespace tc {
namespace autotune {

@@ -72,13 +76,6 @@ void mutate(
}
}

void normalizeVector(std::vector<double>& v) {
auto sum = std::accumulate(v.begin(), v.end(), 0.0);

std::transform(
v.begin(), v.end(), v.begin(), [sum](double v) { return v / sum; });
}

std::vector<double> computeNormalizedFitness(
const GeneticSearch::Population& population) {
std::vector<double> fitness;
@@ -92,6 +89,7 @@ std::vector<double> computeNormalizedFitness(
std::chrono::duration_cast<std::chrono::microseconds>(c->runtime)
.count();
});
sigmaScale(fitness);
normalizeVector(fitness);
return fitness;
}
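
Editorial note: ``sigmaScale`` comes from ``tc/autotuner/utils`` and its exact formula is not shown in this diff. The sketch below illustrates one common form of sigma scaling followed by the normalization step, purely to show why scaling is applied before fitness values become selection probabilities; treat the constant and formula as assumptions.

.. code::

    import statistics

    def sigma_scale(fitness, c=2.0):
        # One common sigma-scaling rule (an assumption here, not necessarily the
        # formula used by tc/autotuner/utils): shift every fitness value by
        # (mean - c * stddev) and clamp at zero. This damps the pull of a few
        # outstanding candidates early on and keeps selection pressure alive
        # once the population has converged.
        mean = statistics.fmean(fitness)
        std = statistics.pstdev(fitness)
        return [max(f - (mean - c * std), 0.0) for f in fitness]

    def normalize(fitness):
        # Divide by the sum so the values can serve as selection probabilities.
        total = sum(fitness)
        return [f / total for f in fitness]

    # Toy example; higher fitness means a better (faster) candidate.
    raw = [5.0, 1.0, 0.8, 0.5]
    print(normalize(sigma_scale(raw)))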
@@ -133,7 +131,8 @@ void dropInvalidConfigurations(GeneticSearch::Population& population) {
} // namespace

#define VALIDATE() \
CHECK_LT(kNumberElites, kMaxPopulationSize); \
CHECK_LT(kMaxPopulationSize, kMatingPoolSize); \
CHECK_LT(kMaxPopulationSize, kSelectionPoolSize); \
CHECK(kMutationRate >= 0 and kMutationRate <= 100) \
<< "the mutation rate (" << kMutationRate \
<< ") should be in the [0,100] interval"; \
@@ -160,13 +159,15 @@ GeneticSearch::GeneticSearch(
size_t n,
uint8_t crossOverRate,
uint8_t mutationRate,
size_t numberElites)
size_t matingPoolSize,
size_t selectionPoolSize)
: population(),
lastBestConf(confs[0]),
kMaxPopulationSize(n),
kMatingPoolSize(matingPoolSize),
kSelectionPoolSize(selectionPoolSize),
kCrossOverRate(crossOverRate),
kMutationRate(mutationRate),
kNumberElites(numberElites),
rng{std::random_device{}()} {
restoreRngState(rng);
VALIDATE();
@@ -192,13 +193,15 @@ GeneticSearch::GeneticSearch(
size_t n,
uint8_t crossOverRate,
uint8_t mutationRate,
size_t numberElites)
size_t matingPoolSize,
size_t selectionPoolSize)
: population(),
lastBestConf(conf),
kMaxPopulationSize(n),
kMatingPoolSize(matingPoolSize),
kSelectionPoolSize(selectionPoolSize),
kCrossOverRate(crossOverRate),
kMutationRate(mutationRate),
kNumberElites(numberElites),
rng{std::random_device{}()} {
restoreRngState(rng);
VALIDATE();
@@ -246,19 +249,34 @@ TuningConfiguration GeneticSearch::crossover(
return a;
}

void GeneticSearch::breed() {
auto accFitness = computeAccumulatedFitness(population);
Population new_population;
new_population.reserve(kMaxPopulationSize);
for (auto& p : population) {
new_population.push_back(
make_unique<CandidateConfiguration>(p->configuration));
std::vector<TuningConfiguration> GeneticSearch::stochasticUniversalSampling(
const std::vector<double>& fitness) const {
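// Editorial note: `fitness` is expected to hold the accumulated (prefix-sum)
// fitness produced by computeAccumulatedFitness() in breed(), so its entries
// are non-decreasing and the last entry is the total.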
std::vector<TuningConfiguration> matingPool;
matingPool.reserve(kMatingPoolSize);

auto r =
std::uniform_real_distribution<double>(0, 1.0 / kMatingPoolSize)(rng);
size_t count = 0;
size_t i = 0;
while (count < kMatingPoolSize) {
while (r <= fitness[i]) {
matingPool.push_back(population[i]->configuration);
r += 1.0 / kMatingPoolSize;
++count;
}
++i;
}
return matingPool;
}
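
Editorial note: for readers new to stochastic universal sampling, the following self-contained Python model shows the same idea as the function above: one random offset and ``pool_size`` equally spaced pointers are swept across the cumulative fitness, giving low-variance fitness-proportionate selection in a single pass. It is an illustration, not the implementation.

.. code::

    import random

    def stochastic_universal_sampling(cumulative_fitness, pool_size, rng=random):
        # `cumulative_fitness` is the prefix sum of normalized fitness values
        # (its last entry is 1.0), mirroring computeAccumulatedFitness().
        step = 1.0 / pool_size
        pointer = rng.uniform(0.0, step)  # one random offset for all pointers
        selected = []
        i = 0
        while len(selected) < pool_size:
            # Every pointer that falls inside candidate i's fitness slice
            # selects candidate i once; fitter candidates own wider slices.
            while len(selected) < pool_size and pointer <= cumulative_fitness[i]:
                selected.append(i)
                pointer += step
            i += 1
        return selected

    # Toy example: candidate 0 holds 60% of the fitness mass, so it should
    # receive roughly 6 of the 10 mating-pool slots.
    normalized = [0.6, 0.2, 0.1, 0.1]
    cumulative = [sum(normalized[:k + 1]) for k in range(len(normalized))]
    print(stochastic_universal_sampling(cumulative, pool_size=10))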

void GeneticSearch::breed() {
auto matingPool =
stochasticUniversalSampling(computeAccumulatedFitness(population));

auto select = [&]() -> TuningConfiguration& {
auto limit = std::uniform_real_distribution<double>{}(rng);
auto lb = std::lower_bound(accFitness.begin(), accFitness.end(), limit);
return population.at(std::distance(accFitness.begin(), lb))->configuration;
auto idx = std::uniform_int_distribution<size_t>{
size_t(0), matingPool.size() - 1}(rng);
return matingPool.at(idx);
};
auto shouldCrossOver = [&]() -> bool {
/*
@@ -270,45 +288,20 @@ void GeneticSearch::breed() {
return dist(rng);
};

while (new_population.size() < kMaxPopulationSize) {
while (selectionPool.size() < kSelectionPoolSize) {
if (shouldCrossOver()) {
auto parent1 = select();
auto parent2 = select();
auto parent3 = select();
new_population.emplace_back(make_unique<CandidateConfiguration>(
selectionPool.emplace_back(make_unique<CandidateConfiguration>(
crossover(parent1, parent2, parent3)));
} else {
new_population.emplace_back(
make_unique<CandidateConfiguration>(select()));
selectionPool.emplace_back(make_unique<CandidateConfiguration>(select()));
}
}
population = std::move(new_population);
}

void GeneticSearch::updateParameters() {
dropInvalidConfigurations(population);

// Sort population before taking any decision
std::sort(
population.begin(),
population.end(),
[](const std::unique_ptr<CandidateConfiguration>& a,
const std::unique_ptr<CandidateConfiguration>& b) {
checkRuntimeRecorded(a->runtime);
checkRuntimeRecorded(b->runtime);
return a->runtime < b->runtime;
});

// Update failsafe lastBestConf
lastBestConf =
population.size() > 0 ? population.front()->configuration : lastBestConf;
if (FLAGS_tuner_print_best) {
CudaMappingOptions options(
CudaMappingOptions::makeSingleThreadCudaMappingOptions());
lastBestConf.applyToCudaMappingOptions(options);
LOG(INFO) << "Best so far:\n" << options;
}

bool GeneticSearch::resetPopulationIfNotEnoughCandidates() {
if (population.size() < kMinCandidatesForBreeding) {
LOG_IF(ERROR, FLAGS_debug_tuner)
<< population.size() << " out of " << kMaxPopulationSize
@@ -327,12 +320,81 @@ void GeneticSearch::updateParameters() {
// Don't lose the first one which was the best from before
CHECK_LT(0, population.size());
randomizePopulation(population.begin() + 1, population.end(), rng);
return;

selectionPool.clear();
for (size_t i = 0; i < kSelectionPoolSize; ++i) {
selectionPool.emplace_back(
make_unique<CandidateConfiguration>(lastBestConf));
}
randomizePopulation(selectionPool.begin() + 1, selectionPool.end(), rng);
return true;
}
return false;
}

namespace {
void sortByRuntime(GeneticSearch::Population& population) {
std::sort(
population.begin(),
population.end(),
[](const std::unique_ptr<CandidateConfiguration>& a,
const std::unique_ptr<CandidateConfiguration>& b) {
checkRuntimeRecorded(a->runtime);
checkRuntimeRecorded(b->runtime);
return a->runtime < b->runtime;
});
}
} // namespace

void GeneticSearch::updateBestCandidate(const TuningConfiguration& c) {
lastBestConf = c;
if (FLAGS_tuner_print_best) {
CudaMappingOptions options(
CudaMappingOptions::makeSingleThreadCudaMappingOptions());
lastBestConf.applyToCudaMappingOptions(options);
LOG(INFO) << "Best so far:\n" << options;
}
}

void GeneticSearch::generateSelectionPool() {
dropInvalidConfigurations(population);
sortByRuntime(population);
updateBestCandidate(
population.size() > 0 ? population.front()->configuration : lastBestConf);
if (resetPopulationIfNotEnoughCandidates()) {
return;
}
selectionPool.clear();
selectionPool.emplace_back(make_unique<CandidateConfiguration>(lastBestConf));
breed();
for (int i = kNumberElites; i < population.size(); ++i) {
mutate(*population[i], kMutationRate, kMutateIterations, rng);
for (size_t i = 1; i < selectionPool.size(); ++i) {
mutate(*selectionPool[i], kMutationRate, kMutateIterations, rng);
}
}

void GeneticSearch::selectSurvivors() {
dropInvalidConfigurations(selectionPool);
sortByRuntime(selectionPool);
population.clear();
std::transform(
selectionPool.begin(),
selectionPool.begin() +
std::min(selectionPool.size(), kMaxPopulationSize),
std::back_inserter(population),
[](const std::unique_ptr<CandidateConfiguration>& c) {
CHECK(c);
return make_unique<CandidateConfiguration>(*c);
});

if (population.size() < kMaxPopulationSize) {
auto numberMissing = kMaxPopulationSize - population.size();

// Top up the freshly selected population (rather than the selection pool,
// which is cleared at the start of the next generation) with randomized
// candidates seeded from the best known configuration.
for (size_t i = 0; i < numberMissing; ++i) {
population.emplace_back(
make_unique<CandidateConfiguration>(lastBestConf));
}
randomizePopulation(
population.end() - numberMissing, population.end(), rng);
}
}
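
Editorial note: taken together, ``generateSelectionPool`` and ``selectSurvivors`` replace the old elitism-based ``updateParameters`` step with a two-phase generation: breed a selection pool that is larger than the population, let the tuning harness benchmark it, then keep only the fastest candidates. The harness that drives these calls is not part of this diff, so the control flow below is a schematic Python model with toy stand-ins for configurations, crossover, and mutation.

.. code::

    import random

    def run_generation(population, runtime, *, max_population_size,
                       mating_pool_size, selection_pool_size, rng=random):
        # Schematic model of one generation under the new scheme. A
        # "configuration" is just a float here and `runtime` is the function
        # being minimized; the real code works on CandidateConfiguration
        # objects and benchmarks generated CUDA kernels.
        population = sorted(population, key=runtime)          # fastest first
        best = population[0]

        # Stand-in for stochasticUniversalSampling(): fitness-proportionate
        # mating pool, where fitness is the reciprocal runtime.
        weights = [1.0 / runtime(c) for c in population]
        mating_pool = rng.choices(population, weights=weights, k=mating_pool_size)

        def crossover(a, b, c):
            return (a + b + c) / 3.0          # toy three-parent crossover

        def mutate(x):
            return x + rng.gauss(0.0, 0.1)    # toy mutation

        # generateSelectionPool(): best-so-far kept intact up front,
        # bred-and-mutated offspring fill the rest of the (larger) pool.
        selection_pool = [best] + [
            mutate(crossover(*rng.sample(mating_pool, 3)))
            for _ in range(selection_pool_size - 1)
        ]

        # selectSurvivors(): benchmark the pool, keep the fastest candidates.
        return sorted(selection_pool, key=runtime)[:max_population_size]

    # Toy objective: pretend the ideal configuration is x == 3.
    runtime = lambda x: (x - 3.0) ** 2 + 0.01
    pop = [random.uniform(0.0, 10.0) for _ in range(10)]
    for _ in range(5):
        pop = run_generation(pop, runtime, max_population_size=10,
                             mating_pool_size=30, selection_pool_size=30)
    print(min(pop, key=runtime))   # drifts toward 3 over the generations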

20 changes: 16 additions & 4 deletions tc/autotuner/genetic_search.h
@@ -70,7 +70,8 @@ class GeneticSearch {
size_t n,
uint8_t crossOverRate,
uint8_t mutationRate,
size_t numberElites);
size_t matingPoolSize,
size_t selectionPoolSize);

/**
* confs are used to seed the first generation, the rest of the population is
@@ -92,13 +93,22 @@ class GeneticSearch {
size_t n,
uint8_t crossOverRate,
uint8_t mutationRate,
size_t numberElites);
size_t matingPoolSize,
size_t selectionPoolSize);

void updateParameters();
void generateSelectionPool();
void selectSurvivors();

private:
std::vector<TuningConfiguration> stochasticUniversalSampling(
const std::vector<double>& fitness) const;

void breed();

void updateBestCandidate(const TuningConfiguration& c);

bool resetPopulationIfNotEnoughCandidates();

TuningConfiguration crossover(
TuningConfiguration&,
TuningConfiguration&,
@@ -111,11 +121,13 @@
using Population = std::vector<std::unique_ptr<CandidateConfiguration>>;

Population population;
Population selectionPool;
TuningConfiguration lastBestConf;
const size_t kMaxPopulationSize;
const size_t kMatingPoolSize;
const size_t kSelectionPoolSize;
const uint8_t kCrossOverRate;
const uint8_t kMutationRate;
const size_t kNumberElites;

/*
* c++11 seeding is (apparently) not of the highest quality: