Skip to content
This repository was archived by the owner on Apr 28, 2023. It is now read-only.

Commit 7f3c7fb

Browse files
nicolasvasilache authored and ftynse committed
Add a prologue profile and uncheckedRun to benchmark checks
This allows properly timing and testing a layer written as multiple TCs. In particular, this will be used to implement the group normalization benchmark in the following commit. Group normalization currently performs better as two successive kernels. The first kernel computes the moments and is tuned separately via tc/benchmarks/moments.cc. The best options object is then taken and injected into the prologue functions so that we can properly compare group normalization.
1 parent ba7a596 commit 7f3c7fb

File tree

1 file changed

+18
-8
lines changed

1 file changed

+18
-8
lines changed

tc/benchmarks/benchmark_fixture.h

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
#include "tc/core/flags.h"
3737
#include "tc/core/scope_guard.h"
3838
#include "tc/core/tensor.h"
39+
#include "tc/core/utils/time.h"
3940
#include "tc/lang/canonicalize.h"
4041

4142
#include <cublas_v2.h> // Must be the same as Caffe2
@@ -98,15 +99,20 @@ struct Benchmark : public ::testing::Test {
9899
using CheckFunction = std::function<bool(
99100
const std::vector<at::Tensor>& inputs,
100101
const std::vector<at::Tensor>& outputs)>;
102+
using PrologueFunction = std::function<tc::ProfilingInfo()>;
101103
std::vector<at::Tensor> Check(
102104
const std::string& tc,
103105
const std::string& name,
104106
const tc::CudaMappingOptions& mappingOptions,
105107
const std::vector<at::Tensor>& inputs,
106-
CheckFunction check_fun = [](const std::vector<at::Tensor>& inputs,
107-
const std::vector<at::Tensor>& outputs) {
108-
return true;
109-
}) {
108+
CheckFunction check_fun =
109+
[](const std::vector<at::Tensor>& inputs,
110+
const std::vector<at::Tensor>& outputs) { return true; },
111+
PrologueFunction prologue_fun =
112+
[]() {
113+
return tc::ProfilingInfo{tc::Duration::zero(),
114+
tc::Duration::zero()};
115+
}) {
110116
// 1. Compile, run and check
111117
auto pExecutor =
112118
tc::aten::compile<tc::CudaBackend>(tc, name, inputs, mappingOptions);
@@ -118,24 +124,28 @@ struct Benchmark : public ::testing::Test {
118124
std::vector<at::Tensor> outputs2 =
119125
tc::aten::prepareOutputs(tc, name, inputs);
120126
RunAndReport(
121-
[&pExecutor, &inputs, &outputs2]() {
127+
[&pExecutor, &inputs, &outputs2, prologue_fun]() {
128+
prologue_fun();
122129
tc::aten::run(*pExecutor, inputs, outputs2);
123130
},
124-
[&pExecutor, &inputs, &outputs2]() {
131+
[&pExecutor, &inputs, &outputs2, prologue_fun]() {
132+
TC_CUDA_RUNTIMEAPI_ENFORCE(cudaDeviceSynchronize());
133+
auto prologueTimings = prologue_fun();
125134
TC_CUDA_RUNTIMEAPI_ENFORCE(cudaDeviceSynchronize());
126135
auto timings = tc::aten::profile(*pExecutor, inputs, outputs2);
127136
TC_CUDA_RUNTIMEAPI_ENFORCE(cudaDeviceSynchronize());
128-
return timings.kernelRuntime;
137+
return prologueTimings.kernelRuntime + timings.kernelRuntime;
129138
},
130139
"COMPILED KERNEL");
131140
// 3. Run and report total compiled time (kernel runtime + CPU overhead)
132141
RunAndReport(
133142
[&pExecutor, &inputs, &outputs2]() {
134143
tc::aten::run(*pExecutor, inputs, outputs2);
135144
},
136-
[&pExecutor, &inputs, &outputs2]() {
145+
[&pExecutor, &inputs, &outputs2, prologue_fun]() {
137146
TC_CUDA_RUNTIMEAPI_ENFORCE(cudaDeviceSynchronize());
138147
auto start(std::chrono::system_clock::now());
148+
prologue_fun();
139149
tc::aten::uncheckedRun(*pExecutor, inputs, outputs2);
140150
TC_CUDA_RUNTIMEAPI_ENFORCE(cudaDeviceSynchronize());
141151
return tc::Duration::since(start);

0 commit comments

Comments
 (0)