Skip to content
This repository was archived by the owner on Apr 28, 2023. It is now read-only.

Commit a202c48

Browse files
author
Theodoros Theodoridis
committed
[genetic search] Switch to (mu,lambda) selection
Previously each generation had mu candidates and generated mu children, all of which survived. This meant that really bad candidates that were randomly generated would survive across generations. With this change, lambda (typically larger than mu) children are generated and the best mu survive. The previous behaviour is a special case in which lambda = mu.
1 parent 2076892 commit a202c48

File tree

16 files changed

+174
-108
lines changed

16 files changed

+174
-108
lines changed

docs/source/framework/pytorch_integration/autotuning_layers.rst

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,6 @@ You can read about all the parameters here - :ref:`autotuner_parameters`.
5858
- :code:`threads` - set this to number of CPU cores available.
5959
- :code:`generations` - 5 to 10 generations is a good number.
6060
- :code:`pop_size` - 10 is usually reasonable. You can try 10 to 20.
61-
- :code:`number_elites` - number of candidates preserved intact between generations. `1` is usually sufficient.
6261
- :code:`min_launch_total_threads` - If you have really small input sizes, set this to `1`.
6362
- :code:`gpus`: Number of gpus to use for autotuning. Default value is "0". Set this to "0,1" if you wish to use two gpus (for example).
6463

@@ -70,15 +69,15 @@ kernel timing. You can adopt the following parameter settings as starters for au
7069
.. code::
7170
7271
settings = {
73-
"threads": 32, "generations": 2, "pop_size": 10, "number_elites": 1
72+
"threads": 32, "generations": 2, "pop_size": 10
7473
}
7574
7675
* The good defaults that run for a bit longer (in exchange for better performance):
7776

7877
.. code::
7978
8079
settings = {
81-
"threads": 32, "generations": 5, "pop_size": 10, "number_elites": 1
80+
"threads": 32, "generations": 5, "pop_size": 10
8281
}
8382
8483
@@ -87,7 +86,7 @@ kernel timing. You can adopt the following parameter settings as starters for au
8786
.. code::
8887
8988
settings = {
90-
"threads": 32, "generations": 25, "pop_size": 100, "number_elites": 10
89+
"threads": 32, "generations": 25, "pop_size": 100
9190
}
9291
9392

docs/source/tutorials/tutorial_tensordot_with_tc.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,7 @@ later.
132132
You can control the amount of autotuning by changing the autotuner parameters. See
133133
:ref:`autotune_parameters` for how to change the settings.
134134

135-
For the setting ``settings={"generations": 25, "pop_size": 100, "number_elites": 10}``, we
135+
For the setting ``settings={"generations": 25, "pop_size": 100}``, we
136136
get a decent kernel performance as shown in the screenshot below (tuned on one M40 GPU):
137137

138138
.. figure:: ../_static/img/autotuning-py.jpg

examples/tensordot.cc

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@
3030

3131
#include "../test/test_harness_aten_cuda.h"
3232

33-
DEFINE_uint32(number_elites, 2, "Number of elites per generation");
3433
DEFINE_uint32(generations, 3, "Number of generations to tune for");
3534
DEFINE_uint32(pop_size, 10, "Population size to tune for");
3635
DEFINE_uint32(threads, 16, "Number of threads to tune with");
@@ -96,7 +95,6 @@ int main(int argc, char** argv) {
9695
::gflags::ParseCommandLineFlags(&argc, &argv, true);
9796
::google::InitGoogleLogging(argv[0]);
9897
setAtenSeed(tc::initRandomSeed(), at::Backend::CUDA);
99-
tc::FLAGS_tuner_gen_number_elites = FLAGS_number_elites;
10098
tc::FLAGS_tuner_gen_generations = FLAGS_generations;
10199
tc::FLAGS_tuner_gen_pop_size = FLAGS_pop_size;
102100
tc::FLAGS_tuner_threads = FLAGS_threads;

include/tc/autotuner/genetic_search.h

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,8 @@ class GeneticSearch {
7070
size_t n,
7171
uint8_t crossOverRate,
7272
uint8_t mutationRate,
73-
size_t numberElites);
73+
size_t matingPoolSize,
74+
size_t selectionPoolSize);
7475

7576
/**
7677
* confs are used to seed the first generation, the rest of the population is
@@ -92,15 +93,22 @@ class GeneticSearch {
9293
size_t n,
9394
uint8_t crossOverRate,
9495
uint8_t mutationRate,
95-
size_t numberElites);
96+
size_t matingPoolSize,
97+
size_t selectionPoolSize);
9698

97-
void updateParameters();
99+
void generateSelectionPool();
100+
void selectSurvivors();
98101

99102
private:
100103
std::vector<TuningConfiguration> stochasticUniversalSampling(
101104
const std::vector<double>& fitness) const;
105+
102106
void breed();
103107

108+
void updateBestCandidate(const TuningConfiguration& c);
109+
110+
void resetPopulationIfNotEnoughCandidates();
111+
104112
TuningConfiguration crossover(
105113
TuningConfiguration&,
106114
TuningConfiguration&,
@@ -113,12 +121,13 @@ class GeneticSearch {
113121
using Population = std::vector<std::unique_ptr<CandidateConfiguration>>;
114122

115123
Population population;
124+
Population selectionPool;
116125
TuningConfiguration lastBestConf;
117126
const size_t kMaxPopulationSize;
118127
const size_t kMatingPoolSize;
128+
const size_t kSelectionPoolSize;
119129
const uint8_t kCrossOverRate;
120130
const uint8_t kMutationRate;
121-
const size_t kNumberElites;
122131

123132
/*
124133
* c++11 seeding is (apparently) not of the highest quality:

include/tc/autotuner/genetic_tuning_harness.h

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,8 @@ class GeneticTunerHarness {
3838
size_t n,
3939
uint8_t crossoverRate,
4040
uint8_t mutationRate,
41-
size_t numberElites,
41+
size_t matingPoolSize,
42+
size_t selectionPoolSize,
4243
lang::TreeRef tc,
4344
std::string kernelName,
4445
const std::unordered_map<size_t, std::vector<const DLTensor*>>& inputs,
@@ -66,12 +67,16 @@ class GeneticTunerHarness {
6667
size_t bestTimeSoFar);
6768

6869
/// Helper function to delegate compiling on the cpu to different threads
69-
template <typename ExecutorType>
70-
void doCompile(ExecutorType& engine);
70+
template <typename ExecutorType, typename Population>
71+
void doCompile(ExecutorType& engine, Population& population);
7172

7273
/// Helper function to delegate running on the gpu to different threads
73-
template <typename ExecutorType>
74-
void doGpuWork(size_t gpu, ExecutorType& engine, Printer& printer);
74+
template <typename ExecutorType, typename Population>
75+
void doGpuWork(
76+
size_t gpu,
77+
ExecutorType& engine,
78+
Population& population,
79+
Printer& printer);
7580

7681
/// Make options from conf
7782
tc::CudaMappingOptions makeOptions(const CandidateConfiguration& conf);
@@ -90,7 +95,8 @@ class GeneticTunerHarness {
9095
const size_t kMaxPopulationSize;
9196
const uint8_t kCrossOverRate;
9297
const uint8_t kMutationRate;
93-
const size_t kNumberElites;
98+
const size_t kMatingPoolSize;
99+
const size_t kSelectionPoolSize;
94100

95101
TuningConfiguration configuration;
96102

include/tc/autotuner/utils/printer.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ namespace autotune {
3333
class Printer {
3434
public:
3535
Printer(
36-
size_t generation,
36+
std::string prefix,
3737
size_t total,
3838
const std::atomic_size_t& currentCompilationJob,
3939
const std::atomic_size_t& numEvaluations);
@@ -47,7 +47,7 @@ class Printer {
4747
private:
4848
void printLoop();
4949

50-
size_t generation_;
50+
std::string prefix_;
5151
std::vector<Duration> runtimes_;
5252
mutable std::mutex runtimesMtx_;
5353

include/tc/core/flags.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,10 +40,11 @@ DECLARE_uint32(benchmark_iterations);
4040

4141
// Used in autotuning
4242
DECLARE_uint32(tuner_gen_pop_size);
43+
DECLARE_uint32(tuner_gen_mating_pool_size);
44+
DECLARE_uint32(tuner_gen_selection_pool_size);
4345
DECLARE_uint32(tuner_gen_crossover_rate);
4446
DECLARE_uint32(tuner_gen_mutation_rate);
4547
DECLARE_uint32(tuner_gen_generations);
46-
DECLARE_uint32(tuner_gen_number_elites);
4748
DECLARE_uint32(tuner_threads);
4849
DECLARE_string(tuner_gpus);
4950
DECLARE_bool(tuner_print_best);

src/autotuner/genetic_autotuner.cc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,8 @@ llvm::Optional<CudaMappingOptions> GeneticAutotuner::tune(
119119
FLAGS_tuner_gen_pop_size,
120120
FLAGS_tuner_gen_crossover_rate,
121121
FLAGS_tuner_gen_mutation_rate,
122-
FLAGS_tuner_gen_number_elites,
122+
FLAGS_tuner_gen_mating_pool_size,
123+
FLAGS_tuner_gen_selection_pool_size,
123124
tcNameMap_.at(tcName),
124125
tcName,
125126
inputs,

src/autotuner/genetic_search.cc

Lines changed: 75 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -162,7 +162,8 @@ void dropInvalidConfigurations(GeneticSearch::Population& population) {
162162
} // namespace
163163

164164
#define VALIDATE() \
165-
CHECK_LT(kNumberElites, kMaxPopulationSize); \
165+
CHECK_LT(kMaxPopulationSize, kMatingPoolSize); \
166+
CHECK_LT(kMaxPopulationSize, kSelectionPoolSize); \
166167
CHECK(kMutationRate >= 0 and kMutationRate <= 100) \
167168
<< "the mutation rate (" << kMutationRate \
168169
<< ") should be in the [0,100] interval"; \
@@ -189,14 +190,15 @@ GeneticSearch::GeneticSearch(
189190
size_t n,
190191
uint8_t crossOverRate,
191192
uint8_t mutationRate,
192-
size_t numberElites)
193+
size_t matingPoolSize,
194+
size_t selectionPoolSize)
193195
: population(),
194196
lastBestConf(confs[0]),
195197
kMaxPopulationSize(n),
196-
kMatingPoolSize(n * 3),
198+
kMatingPoolSize(matingPoolSize),
199+
kSelectionPoolSize(selectionPoolSize),
197200
kCrossOverRate(crossOverRate),
198201
kMutationRate(mutationRate),
199-
kNumberElites(numberElites),
200202
rng{std::random_device{}()} {
201203
restoreRngState(rng);
202204
VALIDATE();
@@ -222,14 +224,15 @@ GeneticSearch::GeneticSearch(
222224
size_t n,
223225
uint8_t crossOverRate,
224226
uint8_t mutationRate,
225-
size_t numberElites)
227+
size_t matingPoolSize,
228+
size_t selectionPoolSize)
226229
: population(),
227230
lastBestConf(conf),
228231
kMaxPopulationSize(n),
229-
kMatingPoolSize(n * 3),
232+
kMatingPoolSize(matingPoolSize),
233+
kSelectionPoolSize(selectionPoolSize),
230234
kCrossOverRate(crossOverRate),
231235
kMutationRate(mutationRate),
232-
kNumberElites(numberElites),
233236
rng{std::random_device{}()} {
234237
restoreRngState(rng);
235238
VALIDATE();
@@ -301,13 +304,6 @@ void GeneticSearch::breed() {
301304
auto matingPool =
302305
stochasticUniversalSampling(computeAccumulatedFitness(population));
303306

304-
Population new_population;
305-
new_population.reserve(kMatingPoolSize);
306-
for (size_t c = 0; c < kNumberElites; ++c) {
307-
new_population.push_back(
308-
make_unique<CandidateConfiguration>(population.at(c)->configuration));
309-
}
310-
311307
auto select = [&]() -> TuningConfiguration& {
312308
auto idx = std::uniform_int_distribution<size_t>{
313309
size_t(0), matingPool.size() - 1}(rng);
@@ -323,45 +319,20 @@ void GeneticSearch::breed() {
323319
return dist(rng);
324320
};
325321

326-
while (new_population.size() < kMaxPopulationSize) {
322+
while (selectionPool.size() < kSelectionPoolSize) {
327323
if (shouldCrossOver()) {
328324
auto parent1 = select();
329325
auto parent2 = select();
330326
auto parent3 = select();
331-
new_population.emplace_back(make_unique<CandidateConfiguration>(
327+
selectionPool.emplace_back(make_unique<CandidateConfiguration>(
332328
crossover(parent1, parent2, parent3)));
333329
} else {
334-
new_population.emplace_back(
335-
make_unique<CandidateConfiguration>(select()));
330+
selectionPool.emplace_back(make_unique<CandidateConfiguration>(select()));
336331
}
337332
}
338-
population = std::move(new_population);
339333
}
340334

341-
void GeneticSearch::updateParameters() {
342-
dropInvalidConfigurations(population);
343-
344-
// Sort population before taking any decision
345-
std::sort(
346-
population.begin(),
347-
population.end(),
348-
[](const std::unique_ptr<CandidateConfiguration>& a,
349-
const std::unique_ptr<CandidateConfiguration>& b) {
350-
checkRuntimeRecorded(a->runtime);
351-
checkRuntimeRecorded(b->runtime);
352-
return a->runtime < b->runtime;
353-
});
354-
355-
// Update failsafe lastBestConf
356-
lastBestConf =
357-
population.size() > 0 ? population.front()->configuration : lastBestConf;
358-
if (FLAGS_tuner_print_best) {
359-
CudaMappingOptions options(
360-
CudaMappingOptions::makeSingleThreadCudaMappingOptions());
361-
lastBestConf.applyToCudaMappingOptions(options);
362-
LOG(INFO) << "Best so far:\n" << options;
363-
}
364-
335+
void GeneticSearch::resetPopulationIfNotEnoughCandidates() {
365336
if (population.size() < kMinCandidatesForBreeding) {
366337
LOG_IF(ERROR, FLAGS_debug_tuner)
367338
<< population.size() << " out of " << kMaxPopulationSize
@@ -380,12 +351,70 @@ void GeneticSearch::updateParameters() {
380351
// Don't lose the first one which was the best from before
381352
CHECK_LT(0, population.size());
382353
randomizePopulation(population.begin() + 1, population.end(), rng);
383-
return;
384354
}
355+
}
385356

357+
namespace {
358+
void sortByRuntime(GeneticSearch::Population& population) {
359+
std::sort(
360+
population.begin(),
361+
population.end(),
362+
[](const std::unique_ptr<CandidateConfiguration>& a,
363+
const std::unique_ptr<CandidateConfiguration>& b) {
364+
checkRuntimeRecorded(a->runtime);
365+
checkRuntimeRecorded(b->runtime);
366+
return a->runtime < b->runtime;
367+
});
368+
}
369+
} // namespace
370+
371+
void GeneticSearch::updateBestCandidate(const TuningConfiguration& c) {
372+
lastBestConf = c;
373+
if (FLAGS_tuner_print_best) {
374+
CudaMappingOptions options(
375+
CudaMappingOptions::makeSingleThreadCudaMappingOptions());
376+
lastBestConf.applyToCudaMappingOptions(options);
377+
LOG(INFO) << "Best so far:\n" << options;
378+
}
379+
}
380+
381+
void GeneticSearch::generateSelectionPool() {
382+
dropInvalidConfigurations(population);
383+
sortByRuntime(population);
384+
updateBestCandidate(
385+
population.size() > 0 ? population.front()->configuration : lastBestConf);
386+
resetPopulationIfNotEnoughCandidates();
386387
breed();
387-
for (int i = kNumberElites; i < population.size(); ++i) {
388-
mutate(*population[i], kMutationRate, kMutateIterations, rng);
388+
selectionPool.clear();
389+
selectionPool.emplace_back(make_unique<CandidateConfiguration>(lastBestConf));
390+
breed();
391+
for (size_t i = 1; i < selectionPool.size(); ++i) {
392+
mutate(*selectionPool[i], kMutationRate, kMutateIterations, rng);
393+
}
394+
}
395+
396+
void GeneticSearch::selectSurvivors() {
397+
dropInvalidConfigurations(selectionPool);
398+
sortByRuntime(selectionPool);
399+
population.clear();
400+
std::transform(
401+
selectionPool.begin(),
402+
selectionPool.begin() +
403+
std::min(selectionPool.size(), kMaxPopulationSize),
404+
std::back_inserter(population),
405+
[](const std::unique_ptr<CandidateConfiguration>& c) {
406+
return make_unique<CandidateConfiguration>(c->configuration);
407+
});
408+
409+
if (selectionPool.size() < kMaxPopulationSize) {
410+
auto numberMissing = kMaxPopulationSize - selectionPool.size();
411+
412+
for (size_t i = 0; i < numberMissing; ++i) {
413+
selectionPool.emplace_back(
414+
make_unique<CandidateConfiguration>(lastBestConf));
415+
}
416+
randomizePopulation(
417+
selectionPool.end() - numberMissing, selectionPool.end(), rng);
389418
}
390419
}
391420

0 commit comments

Comments
 (0)