@@ -59,7 +59,6 @@ DEFINE_bool(
59
59
" Test on other platforms than we claim perf results for" );
60
60
DEFINE_bool (autotune, false , " Enable autotuning" );
61
61
DEFINE_string (save_tuner_proto_prefix, " /tmp" , " Enable autotuning" );
62
- DEFINE_bool (validate_proto, false , " whether to load options from proto" );
63
62
64
63
struct Benchmark : public ::testing::Test {
65
64
void SetUp () {
@@ -230,119 +229,6 @@ struct Benchmark : public ::testing::Test {
230
229
std::cout << " \n ---------------------------------------------------------" ;
231
230
std::cout << " \n\n " ;
232
231
233
- #undef GET_US
234
- }
235
-
236
- // Will disappear soon
237
- public:
238
- void validateProto (
239
- std::string cacheFilename,
240
- const std::string& tc,
241
- const std::string& name,
242
- const std::vector<at::Tensor>& inputs,
243
- CheckFunction check_fun = [](const std::vector<at::Tensor>&,
244
- const std::vector<at::Tensor>&) {
245
- return true ;
246
- }) {
247
- std::cout << " Validating proto from: "
248
- << tc::makeOptionsFilename (cacheFilename) << std::endl;
249
-
250
- using CudaOptionsCache =
251
- tc::autotune::Autotuner<tc::CudaBackend, tc::autotune::GeneticSearch>::
252
- OptionsCacheType;
253
- CudaOptionsCache optionsCache;
254
- optionsCache.loadCacheFromFile (cacheFilename + " .options" );
255
- tc::FLAGS_tuner_gen_restore_number = 1 ;
256
-
257
- auto mappingOptions = [&]() {
258
- auto inputDLTensors = tc::aten::makeDLConstTensors (inputs);
259
- auto outputDLTensors = tc::aten::inferOutputTensorInfo (tc, name, inputs);
260
- return optionsCache.getTopKOptions (
261
- lang::canonicalTc (tc),
262
- tc::makeTensorInfoVector (tc::extractRawPtrs (inputDLTensors)),
263
- tc::makeTensorInfoVector (tc::extractRawPtrs (outputDLTensors)),
264
- tc::CudaGPUInfo::GPUInfo ().getCudaDeviceStr (),
265
- 1 );
266
- }();
267
-
268
- CHECK_GT (mappingOptions.size (), 0 )
269
- << " No mapping options for " << tc << " in loaded cache" ;
270
- auto pExecutor =
271
- tc::aten::compile<tc::CudaBackend>(tc, name, inputs, mappingOptions[0 ]);
272
- auto outputs = tc::aten::prepareOutputs (tc, name, inputs);
273
- tc::aten::run (*pExecutor, inputs, outputs);
274
- EXPECT_TRUE (check_fun (inputs, outputs));
275
- for (size_t i = 1 ; i < tc::FLAGS_benchmark_warmup; ++i) {
276
- tc::aten::run (*pExecutor, inputs, outputs);
277
- }
278
- std::vector<tc::Duration> kernelTimes;
279
- kernelTimes.reserve (tc::FLAGS_benchmark_iterations);
280
- std::vector<tc::Duration> totalTimes;
281
- totalTimes.reserve (tc::FLAGS_benchmark_iterations);
282
- for (size_t i = 0 ; i < tc::FLAGS_benchmark_iterations; ++i) {
283
- auto timings = tc::aten::profile (*pExecutor, inputs, outputs);
284
- kernelTimes.push_back (timings.kernelRuntime );
285
- TC_CUDA_RUNTIMEAPI_ENFORCE (cudaDeviceSynchronize ());
286
- auto start (std::chrono::system_clock::now ());
287
- tc::aten::uncheckedRun (*pExecutor, inputs, outputs);
288
- TC_CUDA_RUNTIMEAPI_ENFORCE (cudaDeviceSynchronize ());
289
- totalTimes.push_back (tc::Duration::since (start));
290
- }
291
-
292
- auto p50idx = static_cast <int >(std::ceil (0.5 * kernelTimes.size ()));
293
- auto p90idx = static_cast <int >(std::ceil (0.9 * kernelTimes.size ()));
294
- auto p99idx = static_cast <int >(std::ceil (0.99 * kernelTimes.size ()));
295
-
296
- std::sort (kernelTimes.begin (), kernelTimes.end ());
297
- #define GET_US (X ) ((X)).toMicroSeconds()
298
-
299
- std::cout << " \n ---------------------------------------------------------" ;
300
- std::cout << " \n ------------- AUTOTUNED VALIDATED KERNEL STATS ----------" ;
301
- std::cout << " \n ------------------ " << tc::FLAGS_benchmark_iterations
302
- << " ITERATIONS ----------------" ;
303
- std::cout << " \n ---------------------------------------------------------" ;
304
- std::cout << " \n " ;
305
- std::cout
306
- << " Min: " << GET_US (kernelTimes.front ()) << " us, "
307
- << " p50: "
308
- << GET_US (kernelTimes.at (std::min (p50idx, (int )kernelTimes.size () - 1 )))
309
- << " us, "
310
- << " p90: "
311
- << GET_US (kernelTimes.at (std::min (p90idx, (int )kernelTimes.size () - 1 )))
312
- << " us, "
313
- << " p99: "
314
- << GET_US (kernelTimes.at (std::min (p99idx, (int )kernelTimes.size () - 1 )))
315
- << " us, "
316
- << " Max: " << GET_US (kernelTimes.back ()) << " us" ;
317
- std::cout << " \n ---------------------------------------------------------" ;
318
- std::cout << " \n\n " ;
319
-
320
- #undef GET_US
321
-
322
- std::sort (totalTimes.begin (), totalTimes.end ());
323
- #define GET_US (X ) ((X)).toMicroSeconds()
324
-
325
- std::cout << " \n ---------------------------------------------------------" ;
326
- std::cout << " \n -------------- AUTOTUNED VALIDATED TOTAL STATS ----------" ;
327
- std::cout << " \n ------------------ " << tc::FLAGS_benchmark_iterations
328
- << " ITERATIONS ----------------" ;
329
- std::cout << " \n ---------------------------------------------------------" ;
330
- std::cout << " \n " ;
331
- std::cout
332
- << " Min: " << GET_US (totalTimes.front ()) << " us, "
333
- << " p50: "
334
- << GET_US (totalTimes.at (std::min (p50idx, (int )totalTimes.size () - 1 )))
335
- << " us, "
336
- << " p90: "
337
- << GET_US (totalTimes.at (std::min (p90idx, (int )totalTimes.size () - 1 )))
338
- << " us, "
339
- << " p99: "
340
- << GET_US (totalTimes.at (std::min (p99idx, (int )totalTimes.size () - 1 )))
341
- << " us, "
342
- << " Max: " << GET_US (totalTimes.back ()) << " us" ;
343
- std::cout << " \n ---------------------------------------------------------" ;
344
- std::cout << " \n\n " ;
345
-
346
232
#undef GET_US
347
233
}
348
234
};
0 commit comments