36
36
#include " tc/core/flags.h"
37
37
#include " tc/core/scope_guard.h"
38
38
#include " tc/core/tensor.h"
39
+ #include " tc/core/utils/time.h"
39
40
#include " tc/lang/canonicalize.h"
40
41
41
42
#include < cublas_v2.h> // Must be the same as Caffe2
@@ -59,7 +60,6 @@ DEFINE_bool(
59
60
" Test on other platforms than we claim perf results for" );
60
61
DEFINE_bool (autotune, false , " Enable autotuning" );
61
62
// Help text previously duplicated the `autotune` flag's description; it now describes this flag.
DEFINE_string (save_tuner_proto_prefix, " /tmp" , " Path prefix under which tuner protos are saved" );
62
- DEFINE_bool (validate_proto, false , " whether to load options from proto" );
63
63
64
64
struct Benchmark : public ::testing::Test {
65
65
void SetUp () {
@@ -99,15 +99,20 @@ struct Benchmark : public ::testing::Test {
99
99
// Callback type that receives the input tensors and the output tensors and
// returns a bool. The default argument Check supplies for it ignores both
// vectors and returns true.
using CheckFunction = std::function<bool (
100
100
const std::vector<at::Tensor>& inputs,
101
101
const std::vector<at::Tensor>& outputs)>;
102
// Nullary callback type returning a tc::ProfilingInfo. In the benchmark
// lambdas visible in Check it is called ahead of the compiled kernel, and the
// kernelRuntime it returns is added to the profiled kernel runtime. Check's
// default for it returns a ProfilingInfo built from two zero durations.
+ using PrologueFunction = std::function<tc::ProfilingInfo()>;
102
103
std::vector<at::Tensor> Check (
103
104
const std::string& tc,
104
105
const std::string& name,
105
106
const tc::CudaMappingOptions& mappingOptions,
106
107
const std::vector<at::Tensor>& inputs,
107
- CheckFunction check_fun = [](const std::vector<at::Tensor>& inputs,
108
- const std::vector<at::Tensor>& outputs) {
109
- return true ;
110
- }) {
108
+ CheckFunction check_fun =
109
+ [](const std::vector<at::Tensor>& inputs,
110
+ const std::vector<at::Tensor>& outputs) { return true ; },
111
+ PrologueFunction prologue_fun =
112
+ []() {
113
+ return tc::ProfilingInfo{tc::Duration::zero (),
114
+ tc::Duration::zero ()};
115
+ }) {
111
116
// 1. Compile, run and check
112
117
auto pExecutor =
113
118
tc::aten::compile<tc::CudaBackend>(tc, name, inputs, mappingOptions);
@@ -119,24 +124,28 @@ struct Benchmark : public ::testing::Test {
119
124
std::vector<at::Tensor> outputs2 =
120
125
tc::aten::prepareOutputs (tc, name, inputs);
121
126
RunAndReport (
122
- [&pExecutor, &inputs, &outputs2]() {
127
+ [&pExecutor, &inputs, &outputs2, prologue_fun]() {
128
+ prologue_fun ();
123
129
tc::aten::run (*pExecutor, inputs, outputs2);
124
130
},
125
- [&pExecutor, &inputs, &outputs2]() {
131
+ [&pExecutor, &inputs, &outputs2, prologue_fun]() {
132
+ TC_CUDA_RUNTIMEAPI_ENFORCE (cudaDeviceSynchronize ());
133
+ auto prologueTimings = prologue_fun ();
126
134
TC_CUDA_RUNTIMEAPI_ENFORCE (cudaDeviceSynchronize ());
127
135
auto timings = tc::aten::profile (*pExecutor, inputs, outputs2);
128
136
TC_CUDA_RUNTIMEAPI_ENFORCE (cudaDeviceSynchronize ());
129
- return timings.kernelRuntime ;
137
+ return prologueTimings. kernelRuntime + timings.kernelRuntime ;
130
138
},
131
139
" COMPILED KERNEL" );
132
140
// 3. Run and report total compiled time (kernel runtime + CPU overhead)
133
141
RunAndReport (
134
142
[&pExecutor, &inputs, &outputs2]() {
135
143
tc::aten::run (*pExecutor, inputs, outputs2);
136
144
},
137
- [&pExecutor, &inputs, &outputs2]() {
145
+ [&pExecutor, &inputs, &outputs2, prologue_fun ]() {
138
146
TC_CUDA_RUNTIMEAPI_ENFORCE (cudaDeviceSynchronize ());
139
147
auto start (std::chrono::system_clock::now ());
148
+ prologue_fun ();
140
149
tc::aten::uncheckedRun (*pExecutor, inputs, outputs2);
141
150
TC_CUDA_RUNTIMEAPI_ENFORCE (cudaDeviceSynchronize ());
142
151
return tc::Duration::since (start);
@@ -230,119 +239,6 @@ struct Benchmark : public ::testing::Test {
230
239
std::cout << " \n ---------------------------------------------------------" ;
231
240
std::cout << " \n\n " ;
232
241
233
- #undef GET_US
234
- }
235
-
236
- // Will disappear soon
237
- public:
238
- void validateProto (
239
- std::string cacheFilename,
240
- const std::string& tc,
241
- const std::string& name,
242
- const std::vector<at::Tensor>& inputs,
243
- CheckFunction check_fun = [](const std::vector<at::Tensor>&,
244
- const std::vector<at::Tensor>&) {
245
- return true ;
246
- }) {
247
- std::cout << " Validating proto from: "
248
- << tc::makeOptionsFilename (cacheFilename) << std::endl;
249
-
250
- using CudaOptionsCache =
251
- tc::autotune::Autotuner<tc::CudaBackend, tc::autotune::GeneticSearch>::
252
- OptionsCacheType;
253
- CudaOptionsCache optionsCache;
254
- optionsCache.loadCacheFromFile (cacheFilename + " .options" );
255
- tc::FLAGS_tuner_gen_restore_number = 1 ;
256
-
257
- auto mappingOptions = [&]() {
258
- auto inputDLTensors = tc::aten::makeDLConstTensors (inputs);
259
- auto outputDLTensors = tc::aten::inferOutputTensorInfo (tc, name, inputs);
260
- return optionsCache.getTopKOptions (
261
- lang::canonicalTc (tc),
262
- tc::makeTensorInfoVector (tc::extractRawPtrs (inputDLTensors)),
263
- tc::makeTensorInfoVector (tc::extractRawPtrs (outputDLTensors)),
264
- tc::CudaGPUInfo::GPUInfo ().getCudaDeviceStr (),
265
- 1 );
266
- }();
267
-
268
- CHECK_GT (mappingOptions.size (), 0 )
269
- << " No mapping options for " << tc << " in loaded cache" ;
270
- auto pExecutor =
271
- tc::aten::compile<tc::CudaBackend>(tc, name, inputs, mappingOptions[0 ]);
272
- auto outputs = tc::aten::prepareOutputs (tc, name, inputs);
273
- tc::aten::run (*pExecutor, inputs, outputs);
274
- EXPECT_TRUE (check_fun (inputs, outputs));
275
- for (size_t i = 1 ; i < tc::FLAGS_benchmark_warmup; ++i) {
276
- tc::aten::run (*pExecutor, inputs, outputs);
277
- }
278
- std::vector<tc::Duration> kernelTimes;
279
- kernelTimes.reserve (tc::FLAGS_benchmark_iterations);
280
- std::vector<tc::Duration> totalTimes;
281
- totalTimes.reserve (tc::FLAGS_benchmark_iterations);
282
- for (size_t i = 0 ; i < tc::FLAGS_benchmark_iterations; ++i) {
283
- auto timings = tc::aten::profile (*pExecutor, inputs, outputs);
284
- kernelTimes.push_back (timings.kernelRuntime );
285
- TC_CUDA_RUNTIMEAPI_ENFORCE (cudaDeviceSynchronize ());
286
- auto start (std::chrono::system_clock::now ());
287
- tc::aten::uncheckedRun (*pExecutor, inputs, outputs);
288
- TC_CUDA_RUNTIMEAPI_ENFORCE (cudaDeviceSynchronize ());
289
- totalTimes.push_back (tc::Duration::since (start));
290
- }
291
-
292
- auto p50idx = static_cast <int >(std::ceil (0.5 * kernelTimes.size ()));
293
- auto p90idx = static_cast <int >(std::ceil (0.9 * kernelTimes.size ()));
294
- auto p99idx = static_cast <int >(std::ceil (0.99 * kernelTimes.size ()));
295
-
296
- std::sort (kernelTimes.begin (), kernelTimes.end ());
297
- #define GET_US (X ) ((X)).toMicroSeconds()
298
-
299
- std::cout << " \n ---------------------------------------------------------" ;
300
- std::cout << " \n ------------- AUTOTUNED VALIDATED KERNEL STATS ----------" ;
301
- std::cout << " \n ------------------ " << tc::FLAGS_benchmark_iterations
302
- << " ITERATIONS ----------------" ;
303
- std::cout << " \n ---------------------------------------------------------" ;
304
- std::cout << " \n " ;
305
- std::cout
306
- << " Min: " << GET_US (kernelTimes.front ()) << " us, "
307
- << " p50: "
308
- << GET_US (kernelTimes.at (std::min (p50idx, (int )kernelTimes.size () - 1 )))
309
- << " us, "
310
- << " p90: "
311
- << GET_US (kernelTimes.at (std::min (p90idx, (int )kernelTimes.size () - 1 )))
312
- << " us, "
313
- << " p99: "
314
- << GET_US (kernelTimes.at (std::min (p99idx, (int )kernelTimes.size () - 1 )))
315
- << " us, "
316
- << " Max: " << GET_US (kernelTimes.back ()) << " us" ;
317
- std::cout << " \n ---------------------------------------------------------" ;
318
- std::cout << " \n\n " ;
319
-
320
- #undef GET_US
321
-
322
- std::sort (totalTimes.begin (), totalTimes.end ());
323
- #define GET_US (X ) ((X)).toMicroSeconds()
324
-
325
- std::cout << " \n ---------------------------------------------------------" ;
326
- std::cout << " \n -------------- AUTOTUNED VALIDATED TOTAL STATS ----------" ;
327
- std::cout << " \n ------------------ " << tc::FLAGS_benchmark_iterations
328
- << " ITERATIONS ----------------" ;
329
- std::cout << " \n ---------------------------------------------------------" ;
330
- std::cout << " \n " ;
331
- std::cout
332
- << " Min: " << GET_US (totalTimes.front ()) << " us, "
333
- << " p50: "
334
- << GET_US (totalTimes.at (std::min (p50idx, (int )totalTimes.size () - 1 )))
335
- << " us, "
336
- << " p90: "
337
- << GET_US (totalTimes.at (std::min (p90idx, (int )totalTimes.size () - 1 )))
338
- << " us, "
339
- << " p99: "
340
- << GET_US (totalTimes.at (std::min (p99idx, (int )totalTimes.size () - 1 )))
341
- << " us, "
342
- << " Max: " << GET_US (totalTimes.back ()) << " us" ;
343
- std::cout << " \n ---------------------------------------------------------" ;
344
- std::cout << " \n\n " ;
345
-
346
242
#undef GET_US
347
243
}
348
244
};
0 commit comments