@@ -30,11 +30,7 @@
 
 #include "../test/test_harness_aten_cuda.h"
 
-DEFINE_uint32(number_elites, 2, "Number of elites per generation");
-DEFINE_uint32(generations, 3, "Number of generations to tune for");
-DEFINE_uint32(pop_size, 10, "Population size to tune for");
-DEFINE_uint32(threads, 16, "Number of threads to tune with");
-DEFINE_string(gpus, "0", "List of gpus to evaluate on");
+DEFINE_string(tuner_proto, "", "Filename to load and store proto cache");
 
 TEST(TensorDot, SimpleAutotune) {
   // 1. Define and setup the TC compilation unit with CUDA memory
@@ -57,7 +53,7 @@ def tensordot(float(N, C1, C2, H, W) I0,
   auto naiveOptions = tc::CudaMappingOptions::makeNaiveCudaMappingOptions();
   tc::autotune::GeneticAutotunerATen geneticAutotuneATen(tc);
   auto bestOption = geneticAutotuneATen.tune(
-      "/tmp/save_results", "tensordot", {I0, I1}, naiveOptions);
+      FLAGS_tuner_proto, "tensordot", {I0, I1}, naiveOptions);
 
   // 4. Compile and run the TC with the best option.
   // Outputs get allocated; could also be pre-allocated and passed.
@@ -91,15 +87,14 @@ def tensordot(float(N, C1, C2, H, W) I0,
   }
 }
 
+// From root, run with:
+// ./build/examples/tensordot --tuner_threads=10 --tuner_gen_pop_size=10
+// --tuner_gen_generations=3 --tuner_gen_number_elites=4
+// --tuner_proto="/tmp/tensordot"
 int main(int argc, char** argv) {
   ::testing::InitGoogleTest(&argc, argv);
   ::gflags::ParseCommandLineFlags(&argc, &argv, true);
   ::google::InitGoogleLogging(argv[0]);
   setAtenSeed(tc::initRandomSeed(), at::Backend::CUDA);
-  tc::FLAGS_tuner_gen_number_elites = FLAGS_number_elites;
-  tc::FLAGS_tuner_gen_generations = FLAGS_generations;
-  tc::FLAGS_tuner_gen_pop_size = FLAGS_pop_size;
-  tc::FLAGS_tuner_threads = FLAGS_threads;
-  tc::FLAGS_tuner_gpus = FLAGS_gpus;
   return RUN_ALL_TESTS();
 }
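
For readers unfamiliar with the pattern, this change drops the example-local DEFINE_uint32/DEFINE_string flags (and the manual copies into tc::FLAGS_tuner_* in main) in favor of the tuner's own --tuner_* gflags, plus a single new --tuner_proto flag whose value is passed straight into GeneticAutotunerATen::tune as the proto-cache filename. The standalone sketch below shows only the gflags mechanics involved; the flag name and default are taken from this diff, while the includes and the printout are illustrative and do not depend on the TC codebase.

// Minimal gflags sketch (illustrative only): define the flag, parse the
// command line, then read FLAGS_tuner_proto where the value is needed.
#include <iostream>
#include <string>

#include <gflags/gflags.h>

DEFINE_string(tuner_proto, "", "Filename to load and store proto cache");

int main(int argc, char** argv) {
  ::gflags::ParseCommandLineFlags(&argc, &argv, true);
  // In the real example this value is forwarded to
  // geneticAutotuneATen.tune(FLAGS_tuner_proto, "tensordot", {I0, I1}, ...).
  std::cout << "proto cache file: " << FLAGS_tuner_proto << std::endl;
  return 0;
}

Invoked as ./sketch --tuner_proto="/tmp/tensordot", this prints the supplied path, mirroring how the example's test picks up the same flag value after ::gflags::ParseCommandLineFlags runs in main.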