Skip to content
This repository was archived by the owner on Apr 28, 2023. It is now read-only.

Commit 5f00c16

Browse files
author
Theodoros Theodoridis
committed
[genetic search] Switch to (mu,lambda) selection
Previously each generation had mu candidates and generated mu children which all survived. This meant that really bad candidates that were randomly generated would survive across generations. With this change, lambda (typically larger than mu) children are generated and the best mu survive. The previous behaviour is a special case in which lambda = mu.
1 parent cb4e92b commit 5f00c16

File tree

16 files changed

+180
-106
lines changed

16 files changed

+180
-106
lines changed

docs/source/framework/pytorch_integration/autotuning_layers.rst

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,6 @@ You can read about all the parameters here - :ref:`autotuner_parameters`.
5858
- :code:`threads` - set this to number of CPU cores available.
5959
- :code:`generations` - 5 to 10 generations is a good number.
6060
- :code:`pop_size` - 10 is usually reasonable. You can try 10 to 20.
61-
- :code:`number_elites` - number of candidates preserved intact between generations. `1` is usually sufficient.
6261
- :code:`min_launch_total_threads` - If you have really input small sizes, set this to `1`.
6362
- :code:`gpus`: Number of gpus to use for autotuning. Default value is "0". Set this to "0,1" if you wish to use two gpus (for example).
6463

@@ -70,15 +69,15 @@ kernel timing. You can adopt the following parameter settings as starters for au
7069
.. code::
7170
7271
settings = {
73-
"threads": 32, "generations": 2, "pop_size": 10, "number_elites": 1
72+
"threads": 32, "generations": 2, "pop_size": 10
7473
}
7574
7675
* The good defaults that run for a bit longer (in exchange for better performance):
7776

7877
.. code::
7978
8079
settings = {
81-
"threads": 32, "generations": 5, "pop_size": 10, "number_elites": 1
80+
"threads": 32, "generations": 5, "pop_size": 10
8281
}
8382
8483
@@ -87,7 +86,7 @@ kernel timing. You can adopt the following parameter settings as starters for au
8786
.. code::
8887
8988
settings = {
90-
"threads": 32, "generations": 25, "pop_size": 100, "number_elites": 10
89+
"threads": 32, "generations": 25, "pop_size": 100
9190
}
9291
9392

docs/source/tutorials/tutorial_tensordot_with_tc.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,7 @@ later.
132132
You can control the amount of autotuning by changing the autotuner parameters. See
133133
:ref:`autotune_parameters` for how to change the settings.
134134

135-
For the setting ``settings={"generations": 25, "pop_size": 100, "number_elites": 10}``, we
135+
For the setting ``settings={"generations": 25, "pop_size": 100}``, we
136136
get a decent kernel performance as shown in the screenshot below (tuned on one M40 GPU):
137137

138138
.. figure:: ../_static/img/autotuning-py.jpg

tc/autotuner/genetic_autotuner.cc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,8 @@ llvm::Optional<CudaMappingOptions> GeneticAutotuner::tune(
120120
FLAGS_tuner_gen_pop_size,
121121
FLAGS_tuner_gen_crossover_rate,
122122
FLAGS_tuner_gen_mutation_rate,
123-
FLAGS_tuner_gen_number_elites,
123+
FLAGS_tuner_gen_mating_pool_size,
124+
FLAGS_tuner_gen_selection_pool_size,
124125
tcNameMap_.at(tcName),
125126
tcName,
126127
inputs,

tc/autotuner/genetic_search.cc

Lines changed: 75 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -162,7 +162,8 @@ void dropInvalidConfigurations(GeneticSearch::Population& population) {
162162
} // namespace
163163

164164
#define VALIDATE() \
165-
CHECK_LT(kNumberElites, kMaxPopulationSize); \
165+
CHECK_LT(kMaxPopulationSize, kMatingPoolSize); \
166+
CHECK_LT(kMaxPopulationSize, kSelectionPoolSize); \
166167
CHECK(kMutationRate >= 0 and kMutationRate <= 100) \
167168
<< "the mutation rate (" << kMutationRate \
168169
<< ") should be in the [0,100] interval"; \
@@ -189,14 +190,15 @@ GeneticSearch::GeneticSearch(
189190
size_t n,
190191
uint8_t crossOverRate,
191192
uint8_t mutationRate,
192-
size_t numberElites)
193+
size_t matingPoolSize,
194+
size_t selectionPoolSize)
193195
: population(),
194196
lastBestConf(confs[0]),
195197
kMaxPopulationSize(n),
196-
kMatingPoolSize(n * 3),
198+
kMatingPoolSize(matingPoolSize),
199+
kSelectionPoolSize(selectionPoolSize),
197200
kCrossOverRate(crossOverRate),
198201
kMutationRate(mutationRate),
199-
kNumberElites(numberElites),
200202
rng{std::random_device{}()} {
201203
restoreRngState(rng);
202204
VALIDATE();
@@ -222,14 +224,15 @@ GeneticSearch::GeneticSearch(
222224
size_t n,
223225
uint8_t crossOverRate,
224226
uint8_t mutationRate,
225-
size_t numberElites)
227+
size_t matingPoolSize,
228+
size_t selectionPoolSize)
226229
: population(),
227230
lastBestConf(conf),
228231
kMaxPopulationSize(n),
229-
kMatingPoolSize(n * 3),
232+
kMatingPoolSize(matingPoolSize),
233+
kSelectionPoolSize(selectionPoolSize),
230234
kCrossOverRate(crossOverRate),
231235
kMutationRate(mutationRate),
232-
kNumberElites(numberElites),
233236
rng{std::random_device{}()} {
234237
restoreRngState(rng);
235238
VALIDATE();
@@ -301,13 +304,6 @@ void GeneticSearch::breed() {
301304
auto matingPool =
302305
stochasticUniversalSampling(computeAccumulatedFitness(population));
303306

304-
Population new_population;
305-
new_population.reserve(kMatingPoolSize);
306-
for (size_t c = 0; c < kNumberElites; ++c) {
307-
new_population.push_back(
308-
make_unique<CandidateConfiguration>(population.at(c)->configuration));
309-
}
310-
311307
auto select = [&]() -> TuningConfiguration& {
312308
auto idx = std::uniform_int_distribution<size_t>{
313309
size_t(0), matingPool.size() - 1}(rng);
@@ -323,45 +319,20 @@ void GeneticSearch::breed() {
323319
return dist(rng);
324320
};
325321

326-
while (new_population.size() < kMaxPopulationSize) {
322+
while (selectionPool.size() < kSelectionPoolSize) {
327323
if (shouldCrossOver()) {
328324
auto parent1 = select();
329325
auto parent2 = select();
330326
auto parent3 = select();
331-
new_population.emplace_back(make_unique<CandidateConfiguration>(
327+
selectionPool.emplace_back(make_unique<CandidateConfiguration>(
332328
crossover(parent1, parent2, parent3)));
333329
} else {
334-
new_population.emplace_back(
335-
make_unique<CandidateConfiguration>(select()));
330+
selectionPool.emplace_back(make_unique<CandidateConfiguration>(select()));
336331
}
337332
}
338-
population = std::move(new_population);
339333
}
340334

341-
void GeneticSearch::updateParameters() {
342-
dropInvalidConfigurations(population);
343-
344-
// Sort population before taking any decision
345-
std::sort(
346-
population.begin(),
347-
population.end(),
348-
[](const std::unique_ptr<CandidateConfiguration>& a,
349-
const std::unique_ptr<CandidateConfiguration>& b) {
350-
checkRuntimeRecorded(a->runtime);
351-
checkRuntimeRecorded(b->runtime);
352-
return a->runtime < b->runtime;
353-
});
354-
355-
// Update failsafe lastBestConf
356-
lastBestConf =
357-
population.size() > 0 ? population.front()->configuration : lastBestConf;
358-
if (FLAGS_tuner_print_best) {
359-
CudaMappingOptions options(
360-
CudaMappingOptions::makeSingleThreadCudaMappingOptions());
361-
lastBestConf.applyToCudaMappingOptions(options);
362-
LOG(INFO) << "Best so far:\n" << options;
363-
}
364-
335+
void GeneticSearch::resetPopulationIfNotEnoughCandidates() {
365336
if (population.size() < kMinCandidatesForBreeding) {
366337
LOG_IF(ERROR, FLAGS_debug_tuner)
367338
<< population.size() << " out of " << kMaxPopulationSize
@@ -380,12 +351,70 @@ void GeneticSearch::updateParameters() {
380351
// Don't lose the first one which was the best from before
381352
CHECK_LT(0, population.size());
382353
randomizePopulation(population.begin() + 1, population.end(), rng);
383-
return;
384354
}
355+
}
385356

357+
namespace {
358+
void sortByRuntime(GeneticSearch::Population& population) {
359+
std::sort(
360+
population.begin(),
361+
population.end(),
362+
[](const std::unique_ptr<CandidateConfiguration>& a,
363+
const std::unique_ptr<CandidateConfiguration>& b) {
364+
checkRuntimeRecorded(a->runtime);
365+
checkRuntimeRecorded(b->runtime);
366+
return a->runtime < b->runtime;
367+
});
368+
}
369+
} // namespace
370+
371+
void GeneticSearch::updateBestCandidate(const TuningConfiguration& c) {
372+
lastBestConf = c;
373+
if (FLAGS_tuner_print_best) {
374+
CudaMappingOptions options(
375+
CudaMappingOptions::makeSingleThreadCudaMappingOptions());
376+
lastBestConf.applyToCudaMappingOptions(options);
377+
LOG(INFO) << "Best so far:\n" << options;
378+
}
379+
}
380+
381+
void GeneticSearch::generateSelectionPool() {
382+
dropInvalidConfigurations(population);
383+
sortByRuntime(population);
384+
updateBestCandidate(
385+
population.size() > 0 ? population.front()->configuration : lastBestConf);
386+
resetPopulationIfNotEnoughCandidates();
386387
breed();
387-
for (int i = kNumberElites; i < population.size(); ++i) {
388-
mutate(*population[i], kMutationRate, kMutateIterations, rng);
388+
selectionPool.clear();
389+
selectionPool.emplace_back(make_unique<CandidateConfiguration>(lastBestConf));
390+
breed();
391+
for (size_t i = 1; i < selectionPool.size(); ++i) {
392+
mutate(*selectionPool[i], kMutationRate, kMutateIterations, rng);
393+
}
394+
}
395+
396+
void GeneticSearch::selectSurvivors() {
397+
dropInvalidConfigurations(selectionPool);
398+
sortByRuntime(selectionPool);
399+
population.clear();
400+
std::transform(
401+
selectionPool.begin(),
402+
selectionPool.begin() +
403+
std::min(selectionPool.size(), kMaxPopulationSize),
404+
std::back_inserter(population),
405+
[](const std::unique_ptr<CandidateConfiguration>& c) {
406+
return make_unique<CandidateConfiguration>(c->configuration);
407+
});
408+
409+
if (selectionPool.size() < kMaxPopulationSize) {
410+
auto numberMissing = kMaxPopulationSize - selectionPool.size();
411+
412+
for (size_t i = 0; i < numberMissing; ++i) {
413+
selectionPool.emplace_back(
414+
make_unique<CandidateConfiguration>(lastBestConf));
415+
}
416+
randomizePopulation(
417+
selectionPool.end() - numberMissing, selectionPool.end(), rng);
389418
}
390419
}
391420

tc/autotuner/genetic_search.h

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,8 @@ class GeneticSearch {
7070
size_t n,
7171
uint8_t crossOverRate,
7272
uint8_t mutationRate,
73-
size_t numberElites);
73+
size_t matingPoolSize,
74+
size_t selectionPoolSize);
7475

7576
/**
7677
* confs are used to seed the first generation, the rest of the population is
@@ -92,15 +93,22 @@ class GeneticSearch {
9293
size_t n,
9394
uint8_t crossOverRate,
9495
uint8_t mutationRate,
95-
size_t numberElites);
96+
size_t matingPoolSize,
97+
size_t selectionPoolSize);
9698

97-
void updateParameters();
99+
void generateSelectionPool();
100+
void selectSurvivors();
98101

99102
private:
100103
std::vector<TuningConfiguration> stochasticUniversalSampling(
101104
const std::vector<double>& fitness) const;
105+
102106
void breed();
103107

108+
void updateBestCandidate(const TuningConfiguration& c);
109+
110+
void resetPopulationIfNotEnoughCandidates();
111+
104112
TuningConfiguration crossover(
105113
TuningConfiguration&,
106114
TuningConfiguration&,
@@ -113,12 +121,13 @@ class GeneticSearch {
113121
using Population = std::vector<std::unique_ptr<CandidateConfiguration>>;
114122

115123
Population population;
124+
Population selectionPool;
116125
TuningConfiguration lastBestConf;
117126
const size_t kMaxPopulationSize;
118127
const size_t kMatingPoolSize;
128+
const size_t kSelectionPoolSize;
119129
const uint8_t kCrossOverRate;
120130
const uint8_t kMutationRate;
121-
const size_t kNumberElites;
122131

123132
/*
124133
* c++11 seeding is (apparently) not of the highest quality:

0 commit comments

Comments
 (0)