Skip to content
This repository was archived by the owner on Apr 28, 2023. It is now read-only.

Commit 8631877

Browse files
author
Theodoros Theodoridis
committed
[genetic search] Switch to (mu,lambda) selection
Previously each generation had mu candidates and generated mu children, which all survived. This meant that really bad candidates that were randomly generated would survive across generations. With this change, lambda (typically larger than mu) children are generated and the best mu survive. The previous behaviour is a special case in which lambda = mu.
1 parent 3a75966 commit 8631877

File tree

12 files changed

+185
-123
lines changed

12 files changed

+185
-123
lines changed

docs/source/framework/pytorch_integration/autotuning_layers.rst

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,6 @@ You can read about all the parameters here - :ref:`autotuner_parameters`.
5858
- :code:`threads` - set this to number of CPU cores available.
5959
- :code:`generations` - 5 to 10 generations is a good number.
6060
- :code:`pop_size` - 10 is usually reasonable. You can try 10 to 20.
61-
- :code:`number_elites` - number of candidates preserved intact between generations. `1` is usually sufficient.
6261
- :code:`min_launch_total_threads` - If you have really small input sizes, set this to `1`.
6362
- :code:`gpus`: Number of gpus to use for autotuning. Default value is "0". Set this to "0,1" if you wish to use two gpus (for example).
6463

@@ -70,15 +69,15 @@ kernel timing. You can adopt the following parameter settings as starters for au
7069
.. code::
7170
7271
settings = {
73-
"threads": 32, "generations": 2, "pop_size": 10, "number_elites": 1
72+
"threads": 32, "generations": 2, "pop_size": 10
7473
}
7574
7675
* The good defaults that run for a bit longer (in exchange for better performance):
7776

7877
.. code::
7978
8079
settings = {
81-
"threads": 32, "generations": 5, "pop_size": 10, "number_elites": 1
80+
"threads": 32, "generations": 5, "pop_size": 10
8281
}
8382
8483
@@ -87,7 +86,7 @@ kernel timing. You can adopt the following parameter settings as starters for au
8786
.. code::
8887
8988
settings = {
90-
"threads": 32, "generations": 25, "pop_size": 100, "number_elites": 10
89+
"threads": 32, "generations": 25, "pop_size": 100
9190
}
9291
9392

docs/source/tutorials/tutorial_tensordot_with_tc.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,7 @@ later.
132132
You can control the amount of autotuning by changing the autotuner parameters. See
133133
:ref:`autotune_parameters` for how to change the settings.
134134

135-
For the setting ``settings={"generations": 25, "pop_size": 100, "number_elites": 10}``, we
135+
For the setting ``settings={"generations": 25, "pop_size": 100}``, we
136136
get a decent kernel performance as shown in the screenshot below (tuned on one M40 GPU):
137137

138138
.. figure:: ../_static/img/autotuning-py.jpg

tc/autotuner/autotuner-inl.h

Lines changed: 51 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -243,56 +243,58 @@ void TuningHarness<Backend>::runOneIteration(
243243
size_t iteration) {
244244
// Define tensors per device once globally
245245
auto devices = detail::parseDevices<Backend>(FLAGS_tuner_devices);
246-
CHECK(executors_.empty());
247-
CHECK(configurations_.empty());
248246
for (uint64_t step = 0; step < searchStrategy.stepsPerIteration; ++step) {
249-
auto& candidates = searchStrategy.candidatesOfStep(step);
250-
// Initialize for this round
251-
currentCompilationJob_.store(0);
252-
numEvaluations_.store(0);
253-
Printer printer(
254-
iteration,
255-
step,
256-
candidates.size(),
257-
currentCompilationJob_,
258-
numEvaluations_);
259-
auto logIterations = FLAGS_tuner_gen_log_generations;
260-
ScopeGuard sgPrinter([logIterations, &printer]() {
261-
printer.stop();
262-
if (logIterations) {
263-
printer.printAll();
264-
}
265-
});
266-
267-
// Just spawn and join new threads for each iteration
268-
std::vector<std::thread> cpuCompilationThreads;
269-
cpuCompilationThreads.reserve(FLAGS_tuner_threads);
270-
ScopeGuard sgCompilationThreads([&cpuCompilationThreads]() {
271-
for (auto& cpuCompilationThread : cpuCompilationThreads) {
272-
cpuCompilationThread.join();
273-
}
274-
});
275-
for (size_t i = 0; i < FLAGS_tuner_threads; ++i) {
276-
cpuCompilationThreads.emplace_back(
277-
[this, &candidates]() { this->doCompile(candidates); });
278-
}
247+
{
248+
CHECK(executors_.empty());
249+
CHECK(configurations_.empty());
250+
auto& candidates = searchStrategy.candidatesOfStep(step);
251+
// Initialize for this round
252+
currentCompilationJob_.store(0);
253+
numEvaluations_.store(0);
254+
Printer printer(
255+
iteration,
256+
step,
257+
candidates.size(),
258+
currentCompilationJob_,
259+
numEvaluations_);
260+
auto logIterations = FLAGS_tuner_gen_log_generations;
261+
ScopeGuard sgPrinter([logIterations, &printer]() {
262+
printer.stop();
263+
if (logIterations) {
264+
printer.printAll();
265+
}
266+
});
279267

280-
// Just spawn and join new threads for each device
281-
std::vector<std::thread> workerThreads;
282-
workerThreads.reserve(devices.size());
283-
LOG_IF(INFO, tc::FLAGS_debug_tuner)
284-
<< "Start evaluation: " << devices.size() << " " << executors_.size()
285-
<< " " << configurations_.size();
286-
ScopeGuard sgDeviceWorkerThreads([&workerThreads]() {
287-
for (auto& workerThread : workerThreads) {
288-
workerThread.join();
268+
// Just spawn and join new threads for each iteration
269+
std::vector<std::thread> cpuCompilationThreads;
270+
cpuCompilationThreads.reserve(FLAGS_tuner_threads);
271+
ScopeGuard sgCompilationThreads([&cpuCompilationThreads]() {
272+
for (auto& cpuCompilationThread : cpuCompilationThreads) {
273+
cpuCompilationThread.join();
274+
}
275+
});
276+
for (size_t i = 0; i < FLAGS_tuner_threads; ++i) {
277+
cpuCompilationThreads.emplace_back(
278+
[this, &candidates]() { this->doCompile(candidates); });
289279
}
290-
});
291-
auto populationSize = candidates.size();
292-
for (auto device : devices) {
293-
workerThreads.emplace_back([this, device, populationSize, &printer]() {
294-
this->doEvaluate(device, populationSize, printer);
280+
281+
// Just spawn and join new threads for each device
282+
std::vector<std::thread> workerThreads;
283+
workerThreads.reserve(devices.size());
284+
LOG_IF(INFO, tc::FLAGS_debug_tuner)
285+
<< "Start evaluation: " << devices.size() << " " << executors_.size()
286+
<< " " << configurations_.size();
287+
ScopeGuard sgDeviceWorkerThreads([&workerThreads]() {
288+
for (auto& workerThread : workerThreads) {
289+
workerThread.join();
290+
}
295291
});
292+
auto populationSize = candidates.size();
293+
for (auto device : devices) {
294+
workerThreads.emplace_back([this, device, populationSize, &printer]() {
295+
this->doEvaluate(device, populationSize, printer);
296+
});
297+
}
296298
}
297299
searchStrategy.finishStep(step);
298300
}
@@ -305,7 +307,6 @@ void TuningHarness<Backend>::runOneIteration(
305307
infoPrinter << bestMappingOptions();
306308
LOG_LINE_BY_LINE(INFO, ssInfo);
307309
}
308-
searchStrategy.updateParameters();
309310
}
310311
} // namespace detail
311312

@@ -462,13 +463,15 @@ Autotuner<Backend, SearchStrategy>::tune(
462463
});
463464

464465
// searchStrategy is passed to tuningHarness.run()
466+
// XXX: this is not generic
465467
SearchStrategy searchStrategy(
466468
configs,
467469
FLAGS_tuner_gen_generations,
468470
FLAGS_tuner_gen_pop_size,
469471
FLAGS_tuner_gen_crossover_rate,
470472
FLAGS_tuner_gen_mutation_rate,
471-
FLAGS_tuner_gen_number_elites);
473+
FLAGS_tuner_gen_mating_pool_size,
474+
FLAGS_tuner_gen_selection_pool_size);
472475

473476
// Create a tuning harness
474477
detail::TuningHarness<Backend> tuningHarness(

tc/autotuner/genetic_search.cc

Lines changed: 87 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -33,11 +33,8 @@ void randomizeParameter(Parameter& param, RNG& rng) {
3333
param.selectOption(paramIndex);
3434
}
3535

36-
template <typename RNG>
37-
void randomizePopulation(
38-
GeneticSearch::Population::iterator begin,
39-
GeneticSearch::Population::iterator end,
40-
RNG& rng) {
36+
template <typename RNG, typename Iterator>
37+
void randomizePopulation(Iterator begin, Iterator end, RNG& rng) {
4138
for (auto candidate = begin; candidate != end; ++candidate) {
4239
auto& conf = (*candidate)->configuration;
4340
do {
@@ -160,7 +157,8 @@ void dropInvalidConfigurations(GeneticSearch::Population& population) {
160157
} // namespace
161158

162159
#define VALIDATE() \
163-
CHECK_LT(numberElites, maxPopulationSize); \
160+
CHECK_LT(maxPopulationSize, matingPoolSize); \
161+
CHECK_LT(maxPopulationSize, selectionPoolSize); \
164162
CHECK(mutationRate >= 0 and mutationRate <= 100) \
165163
<< "the mutation rate (" << mutationRate \
166164
<< ") should be in the [0,100] interval"; \
@@ -188,15 +186,16 @@ GeneticSearch::GeneticSearch(
188186
size_t populationSize,
189187
uint8_t crossOverRate,
190188
uint8_t mutationRate,
191-
size_t numberElites)
189+
size_t matingPoolSize,
190+
size_t selectionPoolSize)
192191
: population(),
193192
lastBestConf(confs[0]),
194193
numGenerations(numGenerations),
195194
maxPopulationSize(populationSize),
196-
matingPoolSize(populationSize * 3),
195+
matingPoolSize(matingPoolSize),
196+
selectionPoolSize(selectionPoolSize),
197197
crossOverRate(crossOverRate),
198198
mutationRate(mutationRate),
199-
numberElites(numberElites),
200199
rng{std::random_device{}()} {
201200
restoreRngState(rng);
202201
VALIDATE();
@@ -276,13 +275,6 @@ void GeneticSearch::breed() {
276275
auto matingPool =
277276
stochasticUniversalSampling(computeAccumulatedFitness(population));
278277

279-
Population new_population;
280-
new_population.reserve(matingPoolSize);
281-
for (size_t c = 0; c < numberElites; ++c) {
282-
new_population.push_back(
283-
make_unique<CandidateConfiguration>(population.at(c)->configuration));
284-
}
285-
286278
auto select = [&]() -> TuningConfiguration& {
287279
auto idx = std::uniform_int_distribution<size_t>{
288280
size_t(0), matingPool.size() - 1}(rng);
@@ -298,39 +290,20 @@ void GeneticSearch::breed() {
298290
return dist(rng);
299291
};
300292

301-
while (new_population.size() < maxPopulationSize) {
293+
while (selectionPool.size() < selectionPoolSize) {
302294
if (shouldCrossOver()) {
303295
auto parent1 = select();
304296
auto parent2 = select();
305297
auto parent3 = select();
306-
new_population.emplace_back(make_unique<CandidateConfiguration>(
298+
selectionPool.emplace_back(make_unique<CandidateConfiguration>(
307299
crossover(parent1, parent2, parent3)));
308300
} else {
309-
new_population.emplace_back(
310-
make_unique<CandidateConfiguration>(select()));
301+
selectionPool.emplace_back(make_unique<CandidateConfiguration>(select()));
311302
}
312303
}
313-
population = std::move(new_population);
314304
}
315305

316-
void GeneticSearch::updateParameters() {
317-
dropInvalidConfigurations(population);
318-
319-
// Sort population before taking any decision
320-
std::sort(
321-
population.begin(),
322-
population.end(),
323-
[](const std::unique_ptr<CandidateConfiguration>& a,
324-
const std::unique_ptr<CandidateConfiguration>& b) {
325-
checkRuntimeRecorded(a->runtime);
326-
checkRuntimeRecorded(b->runtime);
327-
return a->runtime < b->runtime;
328-
});
329-
330-
// Update failsafe lastBestConf
331-
lastBestConf =
332-
population.size() > 0 ? population.front()->configuration : lastBestConf;
333-
306+
bool GeneticSearch::resetPopulationIfNotEnoughCandidates() {
334307
if (population.size() < minCandidatesForBreeding) {
335308
LOG_IF(ERROR, FLAGS_debug_tuner)
336309
<< population.size() << " out of " << maxPopulationSize
@@ -341,30 +314,94 @@ void GeneticSearch::updateParameters() {
341314
"--tuner_min_launch_total_threads=1. This is mostly relevant "
342315
"when autotuning a TC operating on small tensors. The next "
343316
"generation will be randomly initialized.";
344-
population.resize(0);
345-
for (size_t i = 0; i < maxPopulationSize; ++i) {
346-
population.emplace_back(
317+
selectionPool.clear();
318+
for (size_t i = 0; i < selectionPoolSize; ++i) {
319+
selectionPool.emplace_back(
347320
make_unique<CandidateConfiguration>(lastBestConf));
348321
}
349322
// Don't lose the first one which was the best from before
350-
CHECK_LT(0u, population.size());
351-
randomizePopulation(population.begin() + 1, population.end(), rng);
352-
return;
323+
randomizePopulation(selectionPool.begin() + 1, selectionPool.end(), rng);
324+
return true;
353325
}
326+
return false;
327+
}
328+
329+
namespace {
330+
void sortByRuntime(GeneticSearch::Population& population) {
331+
std::sort(
332+
population.begin(),
333+
population.end(),
334+
[](const std::unique_ptr<CandidateConfiguration>& a,
335+
const std::unique_ptr<CandidateConfiguration>& b) {
336+
checkRuntimeRecorded(a->runtime);
337+
checkRuntimeRecorded(b->runtime);
338+
return a->runtime < b->runtime;
339+
});
340+
}
341+
} // namespace
354342

343+
void GeneticSearch::generateSelectionPool() {
344+
dropInvalidConfigurations(population);
345+
sortByRuntime(population);
346+
lastBestConf =
347+
population.size() > 0 ? population.front()->configuration : lastBestConf;
348+
if (resetPopulationIfNotEnoughCandidates()) {
349+
return;
350+
}
355351
breed();
356-
for (size_t i = numberElites; i < population.size(); ++i) {
357-
mutate(*population[i], mutationRate, mutateIterations, rng);
352+
selectionPool.clear();
353+
selectionPool.emplace_back(make_unique<CandidateConfiguration>(lastBestConf));
354+
breed();
355+
for (size_t i = 1; i < selectionPool.size(); ++i) {
356+
mutate(*selectionPool[i], mutationRate, mutateIterations, rng);
357+
}
358+
}
359+
360+
void GeneticSearch::selectSurvivors() {
361+
dropInvalidConfigurations(selectionPool);
362+
sortByRuntime(selectionPool);
363+
population.clear();
364+
std::transform(
365+
selectionPool.begin(),
366+
selectionPool.begin() + std::min(selectionPool.size(), maxPopulationSize),
367+
std::back_inserter(population),
368+
[](const std::unique_ptr<CandidateConfiguration>& c) {
369+
return make_unique<CandidateConfiguration>(c->configuration);
370+
});
371+
372+
if (selectionPool.size() < maxPopulationSize) {
373+
auto numberMissing = maxPopulationSize - selectionPool.size();
374+
375+
for (size_t i = 0; i < numberMissing; ++i) {
376+
selectionPool.emplace_back(
377+
make_unique<CandidateConfiguration>(lastBestConf));
378+
}
379+
randomizePopulation(
380+
selectionPool.rbegin(), selectionPool.rbegin() + numberMissing, rng);
358381
}
359382
}
360383

361384
GeneticSearch::Population& GeneticSearch::candidatesOfStep(uint64_t step) {
362-
if (step != 0) {
363-
throw std::invalid_argument("GeneticSearch has only one step");
385+
if (step > 1) {
386+
throw std::invalid_argument("GeneticSearch has only 2 steps.");
387+
}
388+
if (step == 0) {
389+
return population;
390+
} else {
391+
return selectionPool;
364392
}
365-
return population;
366393
}
367394

395+
void GeneticSearch::finishStep(uint64_t step) {
396+
if (step > 1) {
397+
throw std::invalid_argument("GeneticSearch has only 2 steps.");
398+
}
399+
if (step == 0) {
400+
generateSelectionPool();
401+
} else {
402+
selectSurvivors();
403+
}
404+
}
368405
} // namespace autotune
369406
} // namespace tc
370407

0 commit comments

Comments
 (0)