[C++ API] Loosely typed Duration considered harmful

nicolasvasilache · nicolasvasilache · commit fd57aae51003 · 2018-05-04T13:00:01.000-06:00
This commit roots out a cause of regression coming from mixing size_t
with std::chrono and unsafely converting between durations.

The issue manifested itself as pruning in the autotuner not pruning much,
which hurt overall tuning speed.

There were even 2 declarations of Duration, one in parameters.h and
one in time.h.
This commit tightens up all the loose usages of Duration by basically making it
a new type that holds a std::chrono::microseconds value and only exposes the
minimal API that we need.

We could also drop chrono completely if we don't want to convert between
time intervals.
diff --git a/tc/aten/aten_compiler-inl.h b/tc/aten/aten_compiler-inl.h
@@ -61,8 +61,7 @@ ProfilingInfo profile(
 
   // The total CPU overhead is the total time minus the (synchronized) kernel
   // runtime
-  auto end = std::chrono::system_clock::now();
-  Duration cpuOverhead(end - start);
+  Duration cpuOverhead(Duration::since(start));
   cpuOverhead = cpuOverhead - pi.kernelRuntime;
   return ProfilingInfo{cpuOverhead, pi.kernelRuntime};
 }
diff --git a/tc/autotuner/autotuner-inl.h b/tc/autotuner/autotuner-inl.h
@@ -47,7 +47,7 @@ TuningHarness<Backend>::TuningHarness(
       baseMapping_(baseMapping),
       inputs_(inputs),
       outputs_(outputs),
-      bestTime_(std::numeric_limits<size_t>::max()),
+      bestTime_(Duration::max()),
       bestMappingOptions_(baseMapping),
       optionsCache_(optionsCache) {}
 
@@ -168,9 +168,9 @@ void TuningHarness<Backend>::doEvaluate(
       LOG_LINE_BY_LINE(INFO, ssInfo);
     }
 
-    std::vector<Duration> runtimes;
+    std::vector<Duration> runtimes{Duration::max()};
     try {
-      size_t bestTimeSoFar;
+      Duration bestTimeSoFar(Duration::max());
       {
         std::lock_guard<std::mutex> lock(bestTimeMutex_);
         bestTimeSoFar = bestTime_;
@@ -187,7 +187,7 @@ void TuningHarness<Backend>::doEvaluate(
         runtimes.reserve(kReducedBenchmarkIterations);
         for (size_t i = 0; i < kReducedBenchmarkIterations; ++i) {
           auto timings = pExecutor->profile(inputs, outputs);
-          if (timings.kernelRuntime.count() > 0) {
+          if (timings.kernelRuntime.toMicroSeconds() > 0) {
             runtimes.push_back(timings.kernelRuntime);
           }
         }
@@ -210,10 +210,10 @@ void TuningHarness<Backend>::doEvaluate(
     }
 
     auto prof = median(runtimes);
-    size_t profUs = prof.count();
 
     LOG_IF(INFO, tc::FLAGS_debug_tuner)
-        << "Run on device " << device << " took: " << profUs << "us";
+        << "Run on device " << device << " took: " << prof.toMicroSeconds()
+        << "us";
     printer.record(prof);
     pConf->runtime = prof;
 
@@ -228,8 +228,8 @@ void TuningHarness<Backend>::doEvaluate(
     // Save best time under lock
     {
       std::lock_guard<std::mutex> lock(bestTimeMutex_);
-      if (profUs < bestTime_) {
-        bestTime_ = profUs;
+      if (prof < bestTime_) {
+        bestTime_ = prof;
         bestMappingOptions_ = options;
       }
     }
diff --git a/tc/autotuner/autotuner.h b/tc/autotuner/autotuner.h
@@ -113,7 +113,7 @@ class TuningHarness {
   std::unordered_map<size_t, std::vector<const DLTensor*>> outputs_;
 
   // results
-  size_t bestTime_;
+  Duration bestTime_;
   MappingOptionsType bestMappingOptions_;
 
   // backing options cache
@@ -212,7 +212,7 @@ bool skipExecutionOrWarmup(
     typename Backend::ExecutorType& executor,
     const std::vector<const DLTensor*>& outputs,
     const std::vector<const DLConstTensor*>& inputs,
-    size_t bestTimeSoFar);
+    Duration bestTimeSoFar);
 
 template <typename Backend>
 std::vector<size_t> parseDevices(const std::string& devices);
diff --git a/tc/autotuner/cpu/autotuner.cc b/tc/autotuner/cpu/autotuner.cc
@@ -59,7 +59,7 @@ bool skipExecutionOrWarmup<CpuBackend>(
     typename CpuBackend::ExecutorType& executor,
     const std::vector<const DLTensor*>& outputs,
     const std::vector<const DLConstTensor*>& inputs,
-    size_t bestTimeSoFar) {
+    Duration bestTimeSoFar) {
   LOG(ERROR) << "NYI: skipExecutionOrWarmup<CpuBackend>";
   return false;
 }
diff --git a/tc/autotuner/cuda/autotuner.cc b/tc/autotuner/cuda/autotuner.cc
@@ -73,7 +73,7 @@ bool skipExecutionOrWarmup<CudaBackend>(
     typename CudaBackend::ExecutorType& executor,
     const std::vector<const DLTensor*>& outputs,
     const std::vector<const DLConstTensor*>& inputs,
-    size_t bestTimeSoFar) {
+    Duration bestTimeSoFar) {
   // 1. Prune based on the number of threads: if you don't hit at least k warps
   // (default k = 8; 256 total threads, controlled by
   // FLAGS_tuner_min_launch_total_threads) then it's likely the kernel is not
@@ -107,9 +107,8 @@ bool skipExecutionOrWarmup<CudaBackend>(
   auto timings = executor.profile(inputs, outputs);
   // 2.a.
   constexpr size_t kCatastrophicPerfFactor = 100;
-  if (bestTimeSoFar < std::numeric_limits<size_t>::max() and
-      timings.kernelRuntime >= std::chrono::microseconds(
-                                   (kCatastrophicPerfFactor * bestTimeSoFar))) {
+  if (bestTimeSoFar < Duration::max() and
+      timings.kernelRuntime >= bestTimeSoFar * kCatastrophicPerfFactor) {
     return true;
   }
   // 2.b. during autotuning we don't want to spend too much time executing,
@@ -123,9 +122,8 @@ bool skipExecutionOrWarmup<CudaBackend>(
   // catastrophically bad.
   constexpr int kEarlyPruneFactor = 5;
   timings = executor.profile(inputs, outputs);
-  if (bestTimeSoFar < std::numeric_limits<size_t>::max() and
-      timings.kernelRuntime >=
-          std::chrono::microseconds((kEarlyPruneFactor * bestTimeSoFar))) {
+  if (bestTimeSoFar < Duration::max() and
+      timings.kernelRuntime >= bestTimeSoFar * kEarlyPruneFactor) {
     return true;
   }
 
diff --git a/tc/autotuner/genetic_search.cc b/tc/autotuner/genetic_search.cc
@@ -88,9 +88,7 @@ std::vector<double> computeNormalizedFitness(
       population.end(),
       std::back_inserter(fitness),
       [](const std::unique_ptr<CandidateConfiguration>& c) {
-        return 1.0 /
-            std::chrono::duration_cast<std::chrono::microseconds>(c->runtime)
-                .count();
+        return 1.0 / c->runtime.toMicroSeconds();
       });
   normalizeVector(fitness);
   return fitness;
diff --git a/tc/autotuner/options_cache-inl.h b/tc/autotuner/options_cache-inl.h
@@ -98,8 +98,7 @@ OptionsCacheValue<Backend>::toProtobuf() const {
   typename Backend::OptionsCacheValueProtoType buf_value;
   *(buf_value.mutable_kernel_options()) = mappingOptions.proto();
   for (auto d : runtimes) {
-    buf_value.add_recorded_runtimes(
-        std::chrono::duration_cast<std::chrono::microseconds>(d).count());
+    buf_value.add_recorded_runtimes(d.toMicroSeconds());
   }
   return buf_value;
 }
@@ -109,7 +108,7 @@ OptionsCacheValue<Backend> OptionsCacheValue<Backend>::fromProtobuf(
     const typename Backend::OptionsCacheValueProtoType& proto) {
   std::vector<Duration> runtimes;
   for (auto d : proto.recorded_runtimes()) {
-    runtimes.push_back(Duration(d));
+    runtimes.push_back(Duration::fromMicroSeconds(d));
   }
   return OptionsCacheValue<Backend>{
       runtimes, typename Backend::MappingOptionsType(proto.kernel_options())};
diff --git a/tc/autotuner/parameters.h b/tc/autotuner/parameters.h
@@ -22,6 +22,7 @@
 #include "tc/core/cpu/cpu_mapping_options.h"
 #include "tc/core/cuda/cuda_mapping_options.h"
 #include "tc/core/utils/memory.h"
+#include "tc/core/utils/time.h"
 
 #include <llvm/ADT/Optional.h>
 
@@ -227,9 +228,6 @@ class TuningParameterFixer {
   friend class TuningConfiguration;
 };
 
-using TimePoint = std::chrono::high_resolution_clock::time_point;
-using Duration = std::chrono::high_resolution_clock::duration;
-
 class CandidateConfiguration {
  public:
   CandidateConfiguration(
diff --git a/tc/autotuner/utils.cc b/tc/autotuner/utils.cc
@@ -50,12 +50,6 @@ std::vector<std::size_t> powers2andCeilDivisors(std::size_t val) {
   return res;
 }
 
-namespace {
-uint64_t toMicroseconds(const Duration& d) {
-  return std::chrono::duration_cast<std::chrono::microseconds>(d).count();
-}
-} // namespace
-
 void Printer::record(Duration runtime) {
   std::lock_guard<std::mutex> lock(runtimesMtx_);
   runtimes_.push_back(runtime);
@@ -75,11 +69,11 @@ void Printer::printLoop() {
       std::lock_guard<std::mutex> lock(runtimesMtx_);
       if (not runtimes_.empty()) {
         std::sort(runtimes_.begin(), runtimes_.end());
-        auto best = toMicroseconds(runtimes_.front());
-        auto median = toMicroseconds(runtimes_.at(runtimes_.size() / 2));
-        auto worst = toMicroseconds(runtimes_.back());
-        ss << "   (best/median/worst)us: " << best << '/' << median << '/'
-           << worst;
+        auto best = runtimes_.front();
+        auto median = runtimes_.at(runtimes_.size() / 2);
+        auto worst = runtimes_.back();
+        ss << "   (best/median/worst)us: " << best.toMicroSeconds() << '/'
+           << median.toMicroSeconds() << '/' << worst.toMicroSeconds();
       }
     }
     // XXX: platform specific erase current line and move cursor to begining
@@ -121,21 +115,21 @@ void Printer::stop() {
 }
 
 void Printer::printAll() {
-  auto runtimes = [this]() {
-    std::lock_guard<std::mutex> lock(runtimesMtx_);
-    std::sort(runtimes_.begin(), runtimes_.end());
-    std::vector<uint64_t> runtimes;
+  auto getSortedRuntimes = [this]() {
+    std::vector<size_t> runtimes;
     runtimes.reserve(runtimes_.size());
-    std::transform(
-        runtimes_.begin(),
-        runtimes_.end(),
-        std::back_inserter(runtimes),
-        toMicroseconds);
+    {
+      std::lock_guard<std::mutex> lock(runtimesMtx_);
+      for (auto r : runtimes_) {
+        runtimes.push_back(r.toMicroSeconds());
+      }
+    }
+    std::sort(runtimes.begin(), runtimes.end());
     return runtimes;
-  }();
+  };
   LOG_IF(INFO, FLAGS_debug_tuner)
       << "\n [TUNER][ITERATION LOG] median times of each candidate (in us) "
-      << runtimes << std::endl;
+      << getSortedRuntimes() << std::endl;
 }
 } // namespace autotune
 } // namespace tc
diff --git a/tc/benchmarks/benchmark_fixture.h b/tc/benchmarks/benchmark_fixture.h
@@ -123,21 +123,18 @@ struct Benchmark : public ::testing::Test {
       auto timings = tc::aten::profile(*pExecutor, inputs, outputs);
       kernelTimes.push_back(timings.kernelRuntime);
       TC_CUDA_RUNTIMEAPI_ENFORCE(cudaDeviceSynchronize());
-      auto time(std::chrono::system_clock::now());
+      auto start(std::chrono::system_clock::now());
       tc::aten::uncheckedRun(*pExecutor, inputs, outputs);
       TC_CUDA_RUNTIMEAPI_ENFORCE(cudaDeviceSynchronize());
-      totalTimes.push_back(
-          std::chrono::duration_cast<std::chrono::microseconds>(
-              std::chrono::system_clock::now() - time));
+      totalTimes.push_back(tc::Duration::since(start));
     }
 
     auto p50idx = static_cast<int>(std::ceil(0.5 * kernelTimes.size()));
     auto p90idx = static_cast<int>(std::ceil(0.9 * kernelTimes.size()));
     auto p99idx = static_cast<int>(std::ceil(0.99 * kernelTimes.size()));
 
     std::sort(kernelTimes.begin(), kernelTimes.end());
-#define GET_US(X) \
-  (std::chrono::duration_cast<std::chrono::microseconds>((X)).count())
+#define GET_US(X) ((X)).toMicroSeconds()
 
     std::cout << "\n---------------------------------------------------------";
     std::cout << "\n------------------ COMPILED KERNEL STATS ----------------";
@@ -163,8 +160,7 @@ struct Benchmark : public ::testing::Test {
 #undef GET_US
 
     std::sort(totalTimes.begin(), totalTimes.end());
-#define GET_US(X) \
-  (std::chrono::duration_cast<std::chrono::microseconds>((X)).count())
+#define GET_US(X) ((X)).toMicroSeconds()
 
     std::cout << "\n---------------------------------------------------------";
     std::cout << "\n------------------ COMPILED TOTAL STATS ----------------";
@@ -199,19 +195,17 @@ struct Benchmark : public ::testing::Test {
     std::vector<tc::Duration> times;
     times.reserve(tc::FLAGS_benchmark_iterations);
     for (size_t i = 0; i < tc::FLAGS_benchmark_iterations; ++i) {
-      auto time(std::chrono::system_clock::now());
+      auto start(std::chrono::system_clock::now());
       compute(res);
       TC_CUDA_RUNTIMEAPI_ENFORCE(cudaDeviceSynchronize());
-      times.push_back(std::chrono::duration_cast<std::chrono::microseconds>(
-          std::chrono::system_clock::now() - time));
+      times.push_back(tc::Duration::since(start));
     }
     std::sort(times.begin(), times.end());
     auto p50idx = static_cast<int>(std::ceil(0.5 * times.size()));
     auto p90idx = static_cast<int>(std::ceil(0.9 * times.size()));
     auto p99idx = static_cast<int>(std::ceil(0.99 * times.size()));
 
-#define GET_US(X) \
-  (std::chrono::duration_cast<std::chrono::microseconds>((X)).count())
+#define GET_US(X) ((X)).toMicroSeconds()
 
     std::cout << "\n---------------------------------------------------------";
     std::cout << "\n------------------ REFERENCE IMPL. STATS ----------------";
@@ -285,21 +279,18 @@ struct Benchmark : public ::testing::Test {
       auto timings = tc::aten::profile(*pExecutor, inputs, outputs);
       kernelTimes.push_back(timings.kernelRuntime);
       TC_CUDA_RUNTIMEAPI_ENFORCE(cudaDeviceSynchronize());
-      auto time(std::chrono::system_clock::now());
+      auto start(std::chrono::system_clock::now());
       tc::aten::uncheckedRun(*pExecutor, inputs, outputs);
       TC_CUDA_RUNTIMEAPI_ENFORCE(cudaDeviceSynchronize());
-      totalTimes.push_back(
-          std::chrono::duration_cast<std::chrono::microseconds>(
-              std::chrono::system_clock::now() - time));
+      totalTimes.push_back(tc::Duration::since(start));
     }
 
     auto p50idx = static_cast<int>(std::ceil(0.5 * kernelTimes.size()));
     auto p90idx = static_cast<int>(std::ceil(0.9 * kernelTimes.size()));
     auto p99idx = static_cast<int>(std::ceil(0.99 * kernelTimes.size()));
 
     std::sort(kernelTimes.begin(), kernelTimes.end());
-#define GET_US(X) \
-  (std::chrono::duration_cast<std::chrono::microseconds>((X)).count())
+#define GET_US(X) ((X)).toMicroSeconds()
 
     std::cout << "\n---------------------------------------------------------";
     std::cout << "\n------------- AUTOTUNED VALIDATED KERNEL STATS ----------";
@@ -325,8 +316,7 @@ struct Benchmark : public ::testing::Test {
 #undef GET_US
 
     std::sort(totalTimes.begin(), totalTimes.end());
-#define GET_US(X) \
-  (std::chrono::duration_cast<std::chrono::microseconds>((X)).count())
+#define GET_US(X) ((X)).toMicroSeconds()
 
     std::cout << "\n---------------------------------------------------------";
     std::cout << "\n-------------- AUTOTUNED VALIDATED TOTAL STATS ----------";
@@ -392,21 +382,18 @@ struct Benchmark : public ::testing::Test {
         auto timings = tc::aten::profile(*pExecutor, inputs, outputs);
         kernelTimes.push_back(timings.kernelRuntime);
         TC_CUDA_RUNTIMEAPI_ENFORCE(cudaDeviceSynchronize());
-        auto time(std::chrono::system_clock::now());
+        auto start(std::chrono::system_clock::now());
         tc::aten::uncheckedRun(*pExecutor, inputs, outputs);
         TC_CUDA_RUNTIMEAPI_ENFORCE(cudaDeviceSynchronize());
-        totalTimes.push_back(
-            std::chrono::duration_cast<std::chrono::microseconds>(
-                std::chrono::system_clock::now() - time));
+        totalTimes.push_back(tc::Duration::since(start));
       }
 
       auto p50idx = static_cast<int>(std::ceil(0.5 * kernelTimes.size()));
       auto p90idx = static_cast<int>(std::ceil(0.9 * kernelTimes.size()));
       auto p99idx = static_cast<int>(std::ceil(0.99 * kernelTimes.size()));
       std::sort(kernelTimes.begin(), kernelTimes.end());
 
-#define GET_US(X) \
-  (std::chrono::duration_cast<std::chrono::microseconds>((X)).count())
+#define GET_US(X) ((X)).toMicroSeconds()
 
       {
         std::ofstream out(resultsFilename);
diff --git a/tc/core/cpu/cpu_tc_executor.cc b/tc/core/cpu/cpu_tc_executor.cc
@@ -60,8 +60,6 @@ ProfilingInfo CpuTcExecutor::profileUnchecked(
     const std::vector<const void*>& inputs,
     const std::vector<void*>& outputs) const {
   LOG(ERROR) << "NYI: CpuTcExecutor::profileUnchecked";
-  return ProfilingInfo{
-      Duration(std::chrono::microseconds(static_cast<int64_t>(999999999999))),
-      Duration(std::chrono::microseconds(static_cast<int64_t>(999999999999)))};
+  return ProfilingInfo{Duration::max(), Duration::max()};
 }
 } // namespace tc
diff --git a/tc/core/cuda/cuda_rtc.cc b/tc/core/cuda/cuda_rtc.cc
@@ -202,6 +202,6 @@ Duration CudaRTCFunction::Launch(
   TC_CUDA_RUNTIMEAPI_ENFORCE(cudaEventElapsedTime(&milliseconds, start, stop));
   TC_CUDA_RUNTIMEAPI_ENFORCE(cudaEventDestroy(start));
   TC_CUDA_RUNTIMEAPI_ENFORCE(cudaEventDestroy(stop));
-  return std::chrono::microseconds(static_cast<int64_t>(milliseconds * 1000));
+  return Duration::fromMicroSeconds(milliseconds * 1000);
 }
 } // namespace tc
diff --git a/tc/core/cuda/cuda_tc_executor.cc b/tc/core/cuda/cuda_tc_executor.cc
@@ -144,8 +144,7 @@ ProfilingInfo CudaTcExecutor::profileUnchecked(
       inputs,
       true));
   // The CPU overhead is the total time minus the (synchronized) kernel runtime
-  auto end = std::chrono::system_clock::now();
-  Duration cpuOverhead(end - start);
+  Duration cpuOverhead(Duration::since(start));
   cpuOverhead = cpuOverhead - kernelRuntime;
   return ProfilingInfo{cpuOverhead, kernelRuntime};
 }
diff --git a/tc/core/tc_executor-inl.h b/tc/core/tc_executor-inl.h
@@ -138,8 +138,7 @@ ProfilingInfo TcExecutor<Backend>::profile(
 
   // The total CPU overhead is the total time minus the (synchronized) kernel
   // runtime
-  auto end = std::chrono::system_clock::now();
-  Duration cpuOverhead(end - start);
+  Duration cpuOverhead(Duration::since(start));
   cpuOverhead = cpuOverhead - pi.kernelRuntime;
   return ProfilingInfo{cpuOverhead, pi.kernelRuntime};
 }
diff --git a/tc/core/utils/math.h b/tc/core/utils/math.h
@@ -33,7 +33,7 @@ typename V::value_type median(V v) {
   }
   auto rightElement = v.at(n / 2);
   std::nth_element(v.begin(), v.begin() + n / 2 - 1, v.end());
-  return (v.at(n / 2 - 1) + rightElement) / 2;
+  return (v.at(n / 2 - 1) + rightElement) / 2u;
 }
 
 } // namespace tc
diff --git a/tc/core/utils/time.h b/tc/core/utils/time.h
diff --git a/tc/examples/blockdiagperm.cc b/tc/examples/blockdiagperm.cc
diff --git a/tc/examples/tensordot.cc b/tc/examples/tensordot.cc
diff --git a/tc/examples/wavenet.cc b/tc/examples/wavenet.cc
diff --git a/test/cuda/test_compilation_cache.cc b/test/cuda/test_compilation_cache.cc
diff --git a/test/test_harness_aten_cuda.h b/test/test_harness_aten_cuda.h

Original file line number	Diff line number	Diff line change
`@@ -61,8 +61,7 @@ ProfilingInfo profile(`
`61`	`61`
`62`	`62`	`// The total CPU overhead is the total time minus the (synchronized) kernel`
`63`	`63`	`// runtime`
`64`		`- auto end = std::chrono::system_clock::now();`
`65`		`- Duration cpuOverhead(end - start);`
	`64`	`+ Duration cpuOverhead(Duration::since(start));`
`66`	`65`	`cpuOverhead = cpuOverhead - pi.kernelRuntime;`
`67`	`66`	`return ProfilingInfo{cpuOverhead, pi.kernelRuntime};`
`68`	`67`	`}`
Original file line number	Diff line number	Diff line change
`@@ -59,7 +59,7 @@ bool skipExecutionOrWarmup<CpuBackend>(`
`59`	`59`	`typename CpuBackend::ExecutorType& executor,`
`60`	`60`	`const std::vector<const DLTensor*>& outputs,`
`61`	`61`	`const std::vector<const DLConstTensor*>& inputs,`
`62`		`- size_t bestTimeSoFar) {`
	`62`	`+ Duration bestTimeSoFar) {`
`63`	`63`	`LOG(ERROR) << "NYI: skipExecutionOrWarmup<CpuBackend>";`
`64`	`64`	`return false;`
`65`	`65`	`}`
Original file line number	Diff line number	Diff line change
`@@ -98,8 +98,7 @@ OptionsCacheValue<Backend>::toProtobuf() const {`
`98`	`98`	`typename Backend::OptionsCacheValueProtoType buf_value;`
`99`	`99`	`*(buf_value.mutable_kernel_options()) = mappingOptions.proto();`
`100`	`100`	`for (auto d : runtimes) {`
`101`		`- buf_value.add_recorded_runtimes(`
`102`		`- std::chrono::duration_cast<std::chrono::microseconds>(d).count());`
	`101`	`+ buf_value.add_recorded_runtimes(d.toMicroSeconds());`
`103`	`102`	`}`
`104`	`103`	`return buf_value;`
`105`	`104`	`}`
`@@ -109,7 +108,7 @@ OptionsCacheValue<Backend> OptionsCacheValue<Backend>::fromProtobuf(`
`109`	`108`	`const typename Backend::OptionsCacheValueProtoType& proto) {`
`110`	`109`	`std::vector<Duration> runtimes;`
`111`	`110`	`for (auto d : proto.recorded_runtimes()) {`
`112`		`- runtimes.push_back(Duration(d));`
	`111`	`+ runtimes.push_back(Duration::fromMicroSeconds(d));`
`113`	`112`	`}`
`114`	`113`	`return OptionsCacheValue<Backend>{`
`115`	`114`	`runtimes, typename Backend::MappingOptionsType(proto.kernel_options())};`