
Commit ac2975a

Merge pull request #437 from nicolasvasilache/pr/benchmarks-followup
Benchmarks followup
2 parents: 1dff4ff + c64e75a

14 files changed: +893 −157 lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -24,3 +24,4 @@ examples/results*
 *.pyc
 test_python/tc_test/*
 install
+tc/benchmarks/results*

tc/benchmarks/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -18,6 +18,7 @@ find_library(CUDA_CUDNN_LIBRARIES cudnn
 set(BENCHMARKS
   batchmatmul
   group_convolution
+  group_normalization
   kronecker
   moments
   tmm

tc/benchmarks/benchmark_fixture.h

Lines changed: 18 additions & 122 deletions
@@ -36,6 +36,7 @@
 #include "tc/core/flags.h"
 #include "tc/core/scope_guard.h"
 #include "tc/core/tensor.h"
+#include "tc/core/utils/time.h"
 #include "tc/lang/canonicalize.h"
 
 #include <cublas_v2.h> // Must be the same as Caffe2
@@ -59,7 +60,6 @@ DEFINE_bool(
     "Test on other platforms than we claim perf results for");
 DEFINE_bool(autotune, false, "Enable autotuning");
 DEFINE_string(save_tuner_proto_prefix, "/tmp", "Enable autotuning");
-DEFINE_bool(validate_proto, false, "whether to load options from proto");
 
 struct Benchmark : public ::testing::Test {
   void SetUp() {
@@ -99,15 +99,20 @@ struct Benchmark : public ::testing::Test {
   using CheckFunction = std::function<bool(
       const std::vector<at::Tensor>& inputs,
       const std::vector<at::Tensor>& outputs)>;
+  using PrologueFunction = std::function<tc::ProfilingInfo()>;
   std::vector<at::Tensor> Check(
       const std::string& tc,
       const std::string& name,
       const tc::CudaMappingOptions& mappingOptions,
       const std::vector<at::Tensor>& inputs,
-      CheckFunction check_fun = [](const std::vector<at::Tensor>& inputs,
-                                   const std::vector<at::Tensor>& outputs) {
-        return true;
-      }) {
+      CheckFunction check_fun =
+          [](const std::vector<at::Tensor>& inputs,
+             const std::vector<at::Tensor>& outputs) { return true; },
+      PrologueFunction prologue_fun =
+          []() {
+            return tc::ProfilingInfo{tc::Duration::zero(),
+                                     tc::Duration::zero()};
+          }) {
     // 1. Compile, run and check
     auto pExecutor =
         tc::aten::compile<tc::CudaBackend>(tc, name, inputs, mappingOptions);
@@ -119,24 +124,28 @@ struct Benchmark : public ::testing::Test {
     std::vector<at::Tensor> outputs2 =
         tc::aten::prepareOutputs(tc, name, inputs);
     RunAndReport(
-        [&pExecutor, &inputs, &outputs2]() {
+        [&pExecutor, &inputs, &outputs2, prologue_fun]() {
+          prologue_fun();
           tc::aten::run(*pExecutor, inputs, outputs2);
         },
-        [&pExecutor, &inputs, &outputs2]() {
+        [&pExecutor, &inputs, &outputs2, prologue_fun]() {
+          TC_CUDA_RUNTIMEAPI_ENFORCE(cudaDeviceSynchronize());
+          auto prologueTimings = prologue_fun();
           TC_CUDA_RUNTIMEAPI_ENFORCE(cudaDeviceSynchronize());
           auto timings = tc::aten::profile(*pExecutor, inputs, outputs2);
           TC_CUDA_RUNTIMEAPI_ENFORCE(cudaDeviceSynchronize());
-          return timings.kernelRuntime;
+          return prologueTimings.kernelRuntime + timings.kernelRuntime;
         },
         "COMPILED KERNEL");
     // 3. Run and report total compiled time (kernel runtime + CPU overhead)
     RunAndReport(
         [&pExecutor, &inputs, &outputs2]() {
           tc::aten::run(*pExecutor, inputs, outputs2);
         },
-        [&pExecutor, &inputs, &outputs2]() {
+        [&pExecutor, &inputs, &outputs2, prologue_fun]() {
           TC_CUDA_RUNTIMEAPI_ENFORCE(cudaDeviceSynchronize());
           auto start(std::chrono::system_clock::now());
+          prologue_fun();
           tc::aten::uncheckedRun(*pExecutor, inputs, outputs2);
           TC_CUDA_RUNTIMEAPI_ENFORCE(cudaDeviceSynchronize());
           return tc::Duration::since(start);
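
Both call sites above follow the same contract: RunAndReport receives one callable that performs a plain (warmup) run, a second callable that returns a single tc::Duration per timed iteration, and a label for the printed report. RunAndReport itself is defined elsewhere in benchmark_fixture.h and is not part of this diff; the following is only a rough sketch of such a harness, modeled on the warmup/iteration/percentile pattern of the validateProto body removed in the next hunk. The name RunAndReportSketch and the exact signature are assumptions made for illustration.

#include <algorithm>
#include <cmath>
#include <iostream>
#include <string>
#include <vector>

#include "tc/core/utils/time.h" // tc::Duration (header added by this commit)

// Rough sketch only: warm up, collect one tc::Duration per iteration from the
// timing callable, then print min/p50/p90/p99/max in microseconds.
template <typename RunFn, typename TimeFn>
void RunAndReportSketch(
    RunFn warmup,
    TimeFn timeOneIteration,
    const std::string& label,
    size_t warmupIterations,
    size_t iterations) {
  for (size_t i = 0; i < warmupIterations; ++i) {
    warmup(); // untimed warmup runs
  }
  std::vector<tc::Duration> times;
  times.reserve(iterations);
  for (size_t i = 0; i < iterations; ++i) {
    times.push_back(timeOneIteration()); // one timed sample per iteration
  }
  std::sort(times.begin(), times.end());
  auto percentile = [&](double p) {
    auto idx = static_cast<int>(std::ceil(p * times.size()));
    return times.at(std::min(idx, (int)times.size() - 1)).toMicroSeconds();
  };
  std::cout << label << ": Min: " << times.front().toMicroSeconds() << "us, "
            << "p50: " << percentile(0.5) << "us, "
            << "p90: " << percentile(0.9) << "us, "
            << "p99: " << percentile(0.99) << "us, "
            << "Max: " << times.back().toMicroSeconds() << "us\n";
}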
@@ -230,119 +239,6 @@ struct Benchmark : public ::testing::Test {
     std::cout << "\n---------------------------------------------------------";
     std::cout << "\n\n";
 
-#undef GET_US
-  }
-
-  // Will disappear soon
- public:
-  void validateProto(
-      std::string cacheFilename,
-      const std::string& tc,
-      const std::string& name,
-      const std::vector<at::Tensor>& inputs,
-      CheckFunction check_fun = [](const std::vector<at::Tensor>&,
-                                   const std::vector<at::Tensor>&) {
-        return true;
-      }) {
-    std::cout << "Validating proto from: "
-              << tc::makeOptionsFilename(cacheFilename) << std::endl;
-
-    using CudaOptionsCache =
-        tc::autotune::Autotuner<tc::CudaBackend, tc::autotune::GeneticSearch>::
-            OptionsCacheType;
-    CudaOptionsCache optionsCache;
-    optionsCache.loadCacheFromFile(cacheFilename + ".options");
-    tc::FLAGS_tuner_gen_restore_number = 1;
-
-    auto mappingOptions = [&]() {
-      auto inputDLTensors = tc::aten::makeDLConstTensors(inputs);
-      auto outputDLTensors = tc::aten::inferOutputTensorInfo(tc, name, inputs);
-      return optionsCache.getTopKOptions(
-          lang::canonicalTc(tc),
-          tc::makeTensorInfoVector(tc::extractRawPtrs(inputDLTensors)),
-          tc::makeTensorInfoVector(tc::extractRawPtrs(outputDLTensors)),
-          tc::CudaGPUInfo::GPUInfo().getCudaDeviceStr(),
-          1);
-    }();
-
-    CHECK_GT(mappingOptions.size(), 0)
-        << "No mapping options for " << tc << " in loaded cache";
-    auto pExecutor =
-        tc::aten::compile<tc::CudaBackend>(tc, name, inputs, mappingOptions[0]);
-    auto outputs = tc::aten::prepareOutputs(tc, name, inputs);
-    tc::aten::run(*pExecutor, inputs, outputs);
-    EXPECT_TRUE(check_fun(inputs, outputs));
-    for (size_t i = 1; i < tc::FLAGS_benchmark_warmup; ++i) {
-      tc::aten::run(*pExecutor, inputs, outputs);
-    }
-    std::vector<tc::Duration> kernelTimes;
-    kernelTimes.reserve(tc::FLAGS_benchmark_iterations);
-    std::vector<tc::Duration> totalTimes;
-    totalTimes.reserve(tc::FLAGS_benchmark_iterations);
-    for (size_t i = 0; i < tc::FLAGS_benchmark_iterations; ++i) {
-      auto timings = tc::aten::profile(*pExecutor, inputs, outputs);
-      kernelTimes.push_back(timings.kernelRuntime);
-      TC_CUDA_RUNTIMEAPI_ENFORCE(cudaDeviceSynchronize());
-      auto start(std::chrono::system_clock::now());
-      tc::aten::uncheckedRun(*pExecutor, inputs, outputs);
-      TC_CUDA_RUNTIMEAPI_ENFORCE(cudaDeviceSynchronize());
-      totalTimes.push_back(tc::Duration::since(start));
-    }
-
-    auto p50idx = static_cast<int>(std::ceil(0.5 * kernelTimes.size()));
-    auto p90idx = static_cast<int>(std::ceil(0.9 * kernelTimes.size()));
-    auto p99idx = static_cast<int>(std::ceil(0.99 * kernelTimes.size()));
-
-    std::sort(kernelTimes.begin(), kernelTimes.end());
-#define GET_US(X) ((X)).toMicroSeconds()
-
-    std::cout << "\n---------------------------------------------------------";
-    std::cout << "\n------------- AUTOTUNED VALIDATED KERNEL STATS ----------";
-    std::cout << "\n------------------ " << tc::FLAGS_benchmark_iterations
-              << " ITERATIONS ----------------";
-    std::cout << "\n---------------------------------------------------------";
-    std::cout << "\n";
-    std::cout
-        << "Min: " << GET_US(kernelTimes.front()) << "us, "
-        << "p50: "
-        << GET_US(kernelTimes.at(std::min(p50idx, (int)kernelTimes.size() - 1)))
-        << "us, "
-        << "p90: "
-        << GET_US(kernelTimes.at(std::min(p90idx, (int)kernelTimes.size() - 1)))
-        << "us, "
-        << "p99: "
-        << GET_US(kernelTimes.at(std::min(p99idx, (int)kernelTimes.size() - 1)))
-        << "us, "
-        << "Max: " << GET_US(kernelTimes.back()) << "us";
-    std::cout << "\n---------------------------------------------------------";
-    std::cout << "\n\n";
-
-#undef GET_US
-
-    std::sort(totalTimes.begin(), totalTimes.end());
-#define GET_US(X) ((X)).toMicroSeconds()
-
-    std::cout << "\n---------------------------------------------------------";
-    std::cout << "\n-------------- AUTOTUNED VALIDATED TOTAL STATS ----------";
-    std::cout << "\n------------------ " << tc::FLAGS_benchmark_iterations
-              << " ITERATIONS ----------------";
-    std::cout << "\n---------------------------------------------------------";
-    std::cout << "\n";
-    std::cout
-        << "Min: " << GET_US(totalTimes.front()) << "us, "
-        << "p50: "
-        << GET_US(totalTimes.at(std::min(p50idx, (int)totalTimes.size() - 1)))
-        << "us, "
-        << "p90: "
-        << GET_US(totalTimes.at(std::min(p90idx, (int)totalTimes.size() - 1)))
-        << "us, "
-        << "p99: "
-        << GET_US(totalTimes.at(std::min(p99idx, (int)totalTimes.size() - 1)))
-        << "us, "
-        << "Max: " << GET_US(totalTimes.back()) << "us";
-    std::cout << "\n---------------------------------------------------------";
-    std::cout << "\n\n";
-
 #undef GET_US
   }
 };
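
The practical effect of the new prologue_fun hook is that a benchmark whose measured kernel consumes an intermediate produced by another TC kernel can have the producer's GPU time folded into the reported COMPILED KERNEL number (the prologueTimings.kernelRuntime + timings.kernelRuntime sum above). The sketch below is a hypothetical caller, not code from this commit: the TC definitions, sizes, and names (rowSum, center) are made up for illustration, and it assumes the usual benchmark includes (benchmark_fixture.h, ATen, gtest) are already in place.

// Hypothetical two-kernel benchmark sketch: the producer kernel is re-profiled
// inside prologue_fun so its kernel runtime is added to the main kernel's
// reported time.
TEST_F(Benchmark, TwoKernelPrologueSketch) {
  static constexpr auto producerTc = R"TC(
def rowSum(float(N, M) I) -> (S) {
  S(n) +=! I(n, m)
}
)TC";
  static constexpr auto mainTc = R"TC(
def center(float(N, M) I, float(N) S) -> (O) {
  O(n, m) = I(n, m) - S(n)
}
)TC";
  at::Tensor I = at::CUDA(at::kFloat).rand({128, 1024});
  std::vector<at::Tensor> producerInputs = {I};

  auto options = tc::CudaMappingOptions::makeNaiveMappingOptions();
  auto pProducer = tc::aten::compile<tc::CudaBackend>(
      producerTc, "rowSum", producerInputs, options);
  auto intermediates =
      tc::aten::prepareOutputs(producerTc, "rowSum", producerInputs);
  tc::aten::run(*pProducer, producerInputs, intermediates);

  // The main kernel consumes the original input plus the intermediate.
  std::vector<at::Tensor> mainInputs = {I, intermediates[0]};
  Check(
      mainTc,
      "center",
      options,
      mainInputs,
      [](const std::vector<at::Tensor>&, const std::vector<at::Tensor>&) {
        return true; // correctness check elided in this sketch
      },
      [&]() {
        // Prologue: refresh S and return its ProfilingInfo so that
        // prologueTimings.kernelRuntime is charged to the reported time.
        return tc::aten::profile(*pProducer, producerInputs, intermediates);
      });
}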
