Skip to content
This repository was archived by the owner on Apr 28, 2023. It is now read-only.

Commit fd57aae

Browse files
[C++ API] Loosely typed Duration considered harmful
This commit roots out a cause of regression coming from mixing size_t with std::chrono and unsafely converting between durations. The manifestation of the issue was that pruning in the autotuner was not pruning much affecting overall tuning speed. There were even 2 declaration of Duration, one in parameters.h and one in time.h. This commit tightens up all the loose usages of Duration by basically making it a new type that has std::chrono::microseconds and only exposes the minimal API that we need. We could also drop chrono completely if we don't want to convert between time intervals.
1 parent 0987cf9 commit fd57aae

21 files changed

+172
-168
lines changed

tc/aten/aten_compiler-inl.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,8 +61,7 @@ ProfilingInfo profile(
6161

6262
// The total CPU overhead is the total time minus the (synchronized) kernel
6363
// runtime
64-
auto end = std::chrono::system_clock::now();
65-
Duration cpuOverhead(end - start);
64+
Duration cpuOverhead(Duration::since(start));
6665
cpuOverhead = cpuOverhead - pi.kernelRuntime;
6766
return ProfilingInfo{cpuOverhead, pi.kernelRuntime};
6867
}

tc/autotuner/autotuner-inl.h

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ TuningHarness<Backend>::TuningHarness(
4747
baseMapping_(baseMapping),
4848
inputs_(inputs),
4949
outputs_(outputs),
50-
bestTime_(std::numeric_limits<size_t>::max()),
50+
bestTime_(Duration::max()),
5151
bestMappingOptions_(baseMapping),
5252
optionsCache_(optionsCache) {}
5353

@@ -168,9 +168,9 @@ void TuningHarness<Backend>::doEvaluate(
168168
LOG_LINE_BY_LINE(INFO, ssInfo);
169169
}
170170

171-
std::vector<Duration> runtimes;
171+
std::vector<Duration> runtimes{Duration::max()};
172172
try {
173-
size_t bestTimeSoFar;
173+
Duration bestTimeSoFar(Duration::max());
174174
{
175175
std::lock_guard<std::mutex> lock(bestTimeMutex_);
176176
bestTimeSoFar = bestTime_;
@@ -187,7 +187,7 @@ void TuningHarness<Backend>::doEvaluate(
187187
runtimes.reserve(kReducedBenchmarkIterations);
188188
for (size_t i = 0; i < kReducedBenchmarkIterations; ++i) {
189189
auto timings = pExecutor->profile(inputs, outputs);
190-
if (timings.kernelRuntime.count() > 0) {
190+
if (timings.kernelRuntime.toMicroSeconds() > 0) {
191191
runtimes.push_back(timings.kernelRuntime);
192192
}
193193
}
@@ -210,10 +210,10 @@ void TuningHarness<Backend>::doEvaluate(
210210
}
211211

212212
auto prof = median(runtimes);
213-
size_t profUs = prof.count();
214213

215214
LOG_IF(INFO, tc::FLAGS_debug_tuner)
216-
<< "Run on device " << device << " took: " << profUs << "us";
215+
<< "Run on device " << device << " took: " << prof.toMicroSeconds()
216+
<< "us";
217217
printer.record(prof);
218218
pConf->runtime = prof;
219219

@@ -228,8 +228,8 @@ void TuningHarness<Backend>::doEvaluate(
228228
// Save best time under lock
229229
{
230230
std::lock_guard<std::mutex> lock(bestTimeMutex_);
231-
if (profUs < bestTime_) {
232-
bestTime_ = profUs;
231+
if (prof < bestTime_) {
232+
bestTime_ = prof;
233233
bestMappingOptions_ = options;
234234
}
235235
}

tc/autotuner/autotuner.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ class TuningHarness {
113113
std::unordered_map<size_t, std::vector<const DLTensor*>> outputs_;
114114

115115
// results
116-
size_t bestTime_;
116+
Duration bestTime_;
117117
MappingOptionsType bestMappingOptions_;
118118

119119
// backing options cache
@@ -212,7 +212,7 @@ bool skipExecutionOrWarmup(
212212
typename Backend::ExecutorType& executor,
213213
const std::vector<const DLTensor*>& outputs,
214214
const std::vector<const DLConstTensor*>& inputs,
215-
size_t bestTimeSoFar);
215+
Duration bestTimeSoFar);
216216

217217
template <typename Backend>
218218
std::vector<size_t> parseDevices(const std::string& devices);

tc/autotuner/cpu/autotuner.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ bool skipExecutionOrWarmup<CpuBackend>(
5959
typename CpuBackend::ExecutorType& executor,
6060
const std::vector<const DLTensor*>& outputs,
6161
const std::vector<const DLConstTensor*>& inputs,
62-
size_t bestTimeSoFar) {
62+
Duration bestTimeSoFar) {
6363
LOG(ERROR) << "NYI: skipExecutionOrWarmup<CpuBackend>";
6464
return false;
6565
}

tc/autotuner/cuda/autotuner.cc

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ bool skipExecutionOrWarmup<CudaBackend>(
7373
typename CudaBackend::ExecutorType& executor,
7474
const std::vector<const DLTensor*>& outputs,
7575
const std::vector<const DLConstTensor*>& inputs,
76-
size_t bestTimeSoFar) {
76+
Duration bestTimeSoFar) {
7777
// 1. Prune based on the number of threads: if you don't hit at least k warps
7878
// (default k = 8; 256 total threads, controlled by
7979
// FLAGS_tuner_min_launch_total_threads) then it's likely the kernel is not
@@ -107,9 +107,8 @@ bool skipExecutionOrWarmup<CudaBackend>(
107107
auto timings = executor.profile(inputs, outputs);
108108
// 2.a.
109109
constexpr size_t kCatastrophicPerfFactor = 100;
110-
if (bestTimeSoFar < std::numeric_limits<size_t>::max() and
111-
timings.kernelRuntime >= std::chrono::microseconds(
112-
(kCatastrophicPerfFactor * bestTimeSoFar))) {
110+
if (bestTimeSoFar < Duration::max() and
111+
timings.kernelRuntime >= bestTimeSoFar * kCatastrophicPerfFactor) {
113112
return true;
114113
}
115114
// 2.b. during autotuning we don't want to spend too much time executing,
@@ -123,9 +122,8 @@ bool skipExecutionOrWarmup<CudaBackend>(
123122
// catastrophically bad.
124123
constexpr int kEarlyPruneFactor = 5;
125124
timings = executor.profile(inputs, outputs);
126-
if (bestTimeSoFar < std::numeric_limits<size_t>::max() and
127-
timings.kernelRuntime >=
128-
std::chrono::microseconds((kEarlyPruneFactor * bestTimeSoFar))) {
125+
if (bestTimeSoFar < Duration::max() and
126+
timings.kernelRuntime >= bestTimeSoFar * kEarlyPruneFactor) {
129127
return true;
130128
}
131129

tc/autotuner/genetic_search.cc

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -88,9 +88,7 @@ std::vector<double> computeNormalizedFitness(
8888
population.end(),
8989
std::back_inserter(fitness),
9090
[](const std::unique_ptr<CandidateConfiguration>& c) {
91-
return 1.0 /
92-
std::chrono::duration_cast<std::chrono::microseconds>(c->runtime)
93-
.count();
91+
return 1.0 / c->runtime.toMicroSeconds();
9492
});
9593
normalizeVector(fitness);
9694
return fitness;

tc/autotuner/options_cache-inl.h

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -98,8 +98,7 @@ OptionsCacheValue<Backend>::toProtobuf() const {
9898
typename Backend::OptionsCacheValueProtoType buf_value;
9999
*(buf_value.mutable_kernel_options()) = mappingOptions.proto();
100100
for (auto d : runtimes) {
101-
buf_value.add_recorded_runtimes(
102-
std::chrono::duration_cast<std::chrono::microseconds>(d).count());
101+
buf_value.add_recorded_runtimes(d.toMicroSeconds());
103102
}
104103
return buf_value;
105104
}
@@ -109,7 +108,7 @@ OptionsCacheValue<Backend> OptionsCacheValue<Backend>::fromProtobuf(
109108
const typename Backend::OptionsCacheValueProtoType& proto) {
110109
std::vector<Duration> runtimes;
111110
for (auto d : proto.recorded_runtimes()) {
112-
runtimes.push_back(Duration(d));
111+
runtimes.push_back(Duration::fromMicroSeconds(d));
113112
}
114113
return OptionsCacheValue<Backend>{
115114
runtimes, typename Backend::MappingOptionsType(proto.kernel_options())};

tc/autotuner/parameters.h

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
#include "tc/core/cpu/cpu_mapping_options.h"
2323
#include "tc/core/cuda/cuda_mapping_options.h"
2424
#include "tc/core/utils/memory.h"
25+
#include "tc/core/utils/time.h"
2526

2627
#include <llvm/ADT/Optional.h>
2728

@@ -227,9 +228,6 @@ class TuningParameterFixer {
227228
friend class TuningConfiguration;
228229
};
229230

230-
using TimePoint = std::chrono::high_resolution_clock::time_point;
231-
using Duration = std::chrono::high_resolution_clock::duration;
232-
233231
class CandidateConfiguration {
234232
public:
235233
CandidateConfiguration(

tc/autotuner/utils.cc

Lines changed: 16 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -50,12 +50,6 @@ std::vector<std::size_t> powers2andCeilDivisors(std::size_t val) {
5050
return res;
5151
}
5252

53-
namespace {
54-
uint64_t toMicroseconds(const Duration& d) {
55-
return std::chrono::duration_cast<std::chrono::microseconds>(d).count();
56-
}
57-
} // namespace
58-
5953
void Printer::record(Duration runtime) {
6054
std::lock_guard<std::mutex> lock(runtimesMtx_);
6155
runtimes_.push_back(runtime);
@@ -75,11 +69,11 @@ void Printer::printLoop() {
7569
std::lock_guard<std::mutex> lock(runtimesMtx_);
7670
if (not runtimes_.empty()) {
7771
std::sort(runtimes_.begin(), runtimes_.end());
78-
auto best = toMicroseconds(runtimes_.front());
79-
auto median = toMicroseconds(runtimes_.at(runtimes_.size() / 2));
80-
auto worst = toMicroseconds(runtimes_.back());
81-
ss << " (best/median/worst)us: " << best << '/' << median << '/'
82-
<< worst;
72+
auto best = runtimes_.front();
73+
auto median = runtimes_.at(runtimes_.size() / 2);
74+
auto worst = runtimes_.back();
75+
ss << " (best/median/worst)us: " << best.toMicroSeconds() << '/'
76+
<< median.toMicroSeconds() << '/' << worst.toMicroSeconds();
8377
}
8478
}
8579
// XXX: platform specific erase current line and move cursor to begining
@@ -121,21 +115,21 @@ void Printer::stop() {
121115
}
122116

123117
void Printer::printAll() {
124-
auto runtimes = [this]() {
125-
std::lock_guard<std::mutex> lock(runtimesMtx_);
126-
std::sort(runtimes_.begin(), runtimes_.end());
127-
std::vector<uint64_t> runtimes;
118+
auto getSortedRuntimes = [this]() {
119+
std::vector<size_t> runtimes;
128120
runtimes.reserve(runtimes_.size());
129-
std::transform(
130-
runtimes_.begin(),
131-
runtimes_.end(),
132-
std::back_inserter(runtimes),
133-
toMicroseconds);
121+
{
122+
std::lock_guard<std::mutex> lock(runtimesMtx_);
123+
for (auto r : runtimes_) {
124+
runtimes.push_back(r.toMicroSeconds());
125+
}
126+
}
127+
std::sort(runtimes.begin(), runtimes.end());
134128
return runtimes;
135-
}();
129+
};
136130
LOG_IF(INFO, FLAGS_debug_tuner)
137131
<< "\n [TUNER][ITERATION LOG] median times of each candidate (in us) "
138-
<< runtimes << std::endl;
132+
<< getSortedRuntimes() << std::endl;
139133
}
140134
} // namespace autotune
141135
} // namespace tc

tc/benchmarks/benchmark_fixture.h

Lines changed: 14 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -123,21 +123,18 @@ struct Benchmark : public ::testing::Test {
123123
auto timings = tc::aten::profile(*pExecutor, inputs, outputs);
124124
kernelTimes.push_back(timings.kernelRuntime);
125125
TC_CUDA_RUNTIMEAPI_ENFORCE(cudaDeviceSynchronize());
126-
auto time(std::chrono::system_clock::now());
126+
auto start(std::chrono::system_clock::now());
127127
tc::aten::uncheckedRun(*pExecutor, inputs, outputs);
128128
TC_CUDA_RUNTIMEAPI_ENFORCE(cudaDeviceSynchronize());
129-
totalTimes.push_back(
130-
std::chrono::duration_cast<std::chrono::microseconds>(
131-
std::chrono::system_clock::now() - time));
129+
totalTimes.push_back(tc::Duration::since(start));
132130
}
133131

134132
auto p50idx = static_cast<int>(std::ceil(0.5 * kernelTimes.size()));
135133
auto p90idx = static_cast<int>(std::ceil(0.9 * kernelTimes.size()));
136134
auto p99idx = static_cast<int>(std::ceil(0.99 * kernelTimes.size()));
137135

138136
std::sort(kernelTimes.begin(), kernelTimes.end());
139-
#define GET_US(X) \
140-
(std::chrono::duration_cast<std::chrono::microseconds>((X)).count())
137+
#define GET_US(X) ((X)).toMicroSeconds()
141138

142139
std::cout << "\n---------------------------------------------------------";
143140
std::cout << "\n------------------ COMPILED KERNEL STATS ----------------";
@@ -163,8 +160,7 @@ struct Benchmark : public ::testing::Test {
163160
#undef GET_US
164161

165162
std::sort(totalTimes.begin(), totalTimes.end());
166-
#define GET_US(X) \
167-
(std::chrono::duration_cast<std::chrono::microseconds>((X)).count())
163+
#define GET_US(X) ((X)).toMicroSeconds()
168164

169165
std::cout << "\n---------------------------------------------------------";
170166
std::cout << "\n------------------ COMPILED TOTAL STATS ----------------";
@@ -199,19 +195,17 @@ struct Benchmark : public ::testing::Test {
199195
std::vector<tc::Duration> times;
200196
times.reserve(tc::FLAGS_benchmark_iterations);
201197
for (size_t i = 0; i < tc::FLAGS_benchmark_iterations; ++i) {
202-
auto time(std::chrono::system_clock::now());
198+
auto start(std::chrono::system_clock::now());
203199
compute(res);
204200
TC_CUDA_RUNTIMEAPI_ENFORCE(cudaDeviceSynchronize());
205-
times.push_back(std::chrono::duration_cast<std::chrono::microseconds>(
206-
std::chrono::system_clock::now() - time));
201+
times.push_back(tc::Duration::since(start));
207202
}
208203
std::sort(times.begin(), times.end());
209204
auto p50idx = static_cast<int>(std::ceil(0.5 * times.size()));
210205
auto p90idx = static_cast<int>(std::ceil(0.9 * times.size()));
211206
auto p99idx = static_cast<int>(std::ceil(0.99 * times.size()));
212207

213-
#define GET_US(X) \
214-
(std::chrono::duration_cast<std::chrono::microseconds>((X)).count())
208+
#define GET_US(X) ((X)).toMicroSeconds()
215209

216210
std::cout << "\n---------------------------------------------------------";
217211
std::cout << "\n------------------ REFERENCE IMPL. STATS ----------------";
@@ -285,21 +279,18 @@ struct Benchmark : public ::testing::Test {
285279
auto timings = tc::aten::profile(*pExecutor, inputs, outputs);
286280
kernelTimes.push_back(timings.kernelRuntime);
287281
TC_CUDA_RUNTIMEAPI_ENFORCE(cudaDeviceSynchronize());
288-
auto time(std::chrono::system_clock::now());
282+
auto start(std::chrono::system_clock::now());
289283
tc::aten::uncheckedRun(*pExecutor, inputs, outputs);
290284
TC_CUDA_RUNTIMEAPI_ENFORCE(cudaDeviceSynchronize());
291-
totalTimes.push_back(
292-
std::chrono::duration_cast<std::chrono::microseconds>(
293-
std::chrono::system_clock::now() - time));
285+
totalTimes.push_back(tc::Duration::since(start));
294286
}
295287

296288
auto p50idx = static_cast<int>(std::ceil(0.5 * kernelTimes.size()));
297289
auto p90idx = static_cast<int>(std::ceil(0.9 * kernelTimes.size()));
298290
auto p99idx = static_cast<int>(std::ceil(0.99 * kernelTimes.size()));
299291

300292
std::sort(kernelTimes.begin(), kernelTimes.end());
301-
#define GET_US(X) \
302-
(std::chrono::duration_cast<std::chrono::microseconds>((X)).count())
293+
#define GET_US(X) ((X)).toMicroSeconds()
303294

304295
std::cout << "\n---------------------------------------------------------";
305296
std::cout << "\n------------- AUTOTUNED VALIDATED KERNEL STATS ----------";
@@ -325,8 +316,7 @@ struct Benchmark : public ::testing::Test {
325316
#undef GET_US
326317

327318
std::sort(totalTimes.begin(), totalTimes.end());
328-
#define GET_US(X) \
329-
(std::chrono::duration_cast<std::chrono::microseconds>((X)).count())
319+
#define GET_US(X) ((X)).toMicroSeconds()
330320

331321
std::cout << "\n---------------------------------------------------------";
332322
std::cout << "\n-------------- AUTOTUNED VALIDATED TOTAL STATS ----------";
@@ -392,21 +382,18 @@ struct Benchmark : public ::testing::Test {
392382
auto timings = tc::aten::profile(*pExecutor, inputs, outputs);
393383
kernelTimes.push_back(timings.kernelRuntime);
394384
TC_CUDA_RUNTIMEAPI_ENFORCE(cudaDeviceSynchronize());
395-
auto time(std::chrono::system_clock::now());
385+
auto start(std::chrono::system_clock::now());
396386
tc::aten::uncheckedRun(*pExecutor, inputs, outputs);
397387
TC_CUDA_RUNTIMEAPI_ENFORCE(cudaDeviceSynchronize());
398-
totalTimes.push_back(
399-
std::chrono::duration_cast<std::chrono::microseconds>(
400-
std::chrono::system_clock::now() - time));
388+
totalTimes.push_back(tc::Duration::since(start));
401389
}
402390

403391
auto p50idx = static_cast<int>(std::ceil(0.5 * kernelTimes.size()));
404392
auto p90idx = static_cast<int>(std::ceil(0.9 * kernelTimes.size()));
405393
auto p99idx = static_cast<int>(std::ceil(0.99 * kernelTimes.size()));
406394
std::sort(kernelTimes.begin(), kernelTimes.end());
407395

408-
#define GET_US(X) \
409-
(std::chrono::duration_cast<std::chrono::microseconds>((X)).count())
396+
#define GET_US(X) ((X)).toMicroSeconds()
410397

411398
{
412399
std::ofstream out(resultsFilename);

0 commit comments

Comments
 (0)