Remove best options from autotuner and always recover from cache

nicolasvasilache · nicolasvasilache · commit 2374c7540d87 · 2018-07-23T05:28:15.000-07:00
This commit supposedly addresses issue facebookresearch#523 (only supposedly because there is no easy repro). The problem is conjectured to come from the tuner keeping the best time/option in a private field whereas the functions that interact with the cache files operate on the cache. When multiple entries have the same runtime, it is conjectured (by @ftynse) that the ordering of the cache entries does not match the private field. In hindsight this can easily happen with thread/block sizes because once the number of threads/blocks is one per loop element, one can increase the values passed to mapping options but the same code will be generated after tightening. It is not too much of a stretch to imagine that the same code will occasionally have the same runtime. This commit drops the private state and ensures we always fetch the required values from the options cache (under its lock).
diff --git a/tc/aten/aten_autotuner-inl.h b/tc/aten/aten_autotuner-inl.h
@@ -53,6 +53,7 @@ ATenAutotuner<Backend, Search>::tune(
     const std::string& tcName,
     const std::vector<at::Tensor>& inputs,
     const std::vector<typename Backend::MappingOptionsType>& baseMappings,
+    size_t topK,
     const tc::autotune::TuningParameterFixer& fixedParams) {
   // TODO: some checks that inputs memory lives on the proper Backend device
 
@@ -91,6 +92,7 @@ ATenAutotuner<Backend, Search>::tune(
       rawInputsPerDevice,
       rawOutputsPerDevice,
       baseMappings,
+      topK,
       fixedParams);
 }
 } // namespace aten
diff --git a/tc/aten/aten_autotuner.h b/tc/aten/aten_autotuner.h
@@ -80,6 +80,7 @@ class ATenAutotuner : public tc::autotune::Autotuner<Backend, SearchStrategy> {
       const std::string& tcEntryPoint,
       const std::vector<at::Tensor>& inputs,
       const std::vector<MappingOptionsType>& baseMappings,
+      size_t topK = 1,
       const tc::autotune::TuningParameterFixer& fixedParams = {});
 
  protected:
diff --git a/tc/autotuner/autotuner-inl.h b/tc/autotuner/autotuner-inl.h
@@ -15,6 +15,7 @@
  */
 #include <atomic>
 #include <chrono>
+#include <functional>
 #include <numeric>
 #include <thread>
 
@@ -48,8 +49,6 @@ TuningHarness<Backend>::TuningHarness(
       baseMapping_(baseMapping),
       inputs_(inputs),
       outputs_(outputs),
-      bestTime_(Duration::max()),
-      bestMappingOptions_(baseMapping),
       optionsCache_(optionsCache) {}
 
 template <typename Backend>
@@ -67,13 +66,6 @@ void TuningHarness<Backend>::stopAfterCurrentIteration() {
   stopRequested_ = true;
 }
 
-template <typename Backend>
-const typename Backend::MappingOptionsType&
-TuningHarness<Backend>::bestMappingOptions() const {
-  std::lock_guard<std::mutex> lock(bestTimeMutex_);
-  return bestMappingOptions_;
-}
-
 #define LOG_LINE_BY_LINE(GSTREAM, ISTREAM)               \
   for (std::string line; std::getline(ISTREAM, line);) { \
     LOG(GSTREAM) << line;                                \
@@ -180,11 +172,13 @@ void TuningHarness<Backend>::doEvaluate(
 
     std::vector<Duration> runtimes{Duration::max()};
     try {
-      Duration bestTimeSoFar(Duration::max());
-      {
-        std::lock_guard<std::mutex> lock(bestTimeMutex_);
-        bestTimeSoFar = bestTime_;
-      }
+      auto vBest = optionsCache_->getTopKEntries(
+          lang::canonicalTc(tcTree_),
+          makeTensorInfoVector(inputs),
+          makeTensorInfoVector(outputs),
+          Backend::backendString(),
+          1);
+      Duration bestTimeSoFar = vBest[0].second;
       auto prune = detail::skipExecutionOrWarmup<Backend>(
           *pExecutor, outputs, inputs, bestTimeSoFar);
       if (prune) {
@@ -234,15 +228,6 @@ void TuningHarness<Backend>::doEvaluate(
         Backend::backendString(),
         options,
         prof);
-
-    // Save best time under lock
-    {
-      std::lock_guard<std::mutex> lock(bestTimeMutex_);
-      if (prof < bestTime_) {
-        bestTime_ = prof;
-        bestMappingOptions_ = options;
-      }
-    }
   } // end while
 }
 
@@ -310,7 +295,13 @@ void TuningHarness<Backend>::runOneIteration(
     LOG(INFO) << "[TUNER][ITERATION LOG] best option so far:";
     std::stringstream ssInfo;
     typename Backend::MappingOptionsCppPrinter infoPrinter(ssInfo);
-    infoPrinter << bestMappingOptions();
+    auto vBest = optionsCache_->getTopKOptions(
+        lang::canonicalTc(tcTree_),
+        makeTensorInfoVector(inputs_.begin()->second),
+        makeTensorInfoVector(outputs_.begin()->second),
+        Backend::backendString(),
+        1);
+    infoPrinter << vBest[0];
     LOG_LINE_BY_LINE(INFO, ssInfo);
   }
   searchStrategy.updateParameters();
@@ -426,6 +417,7 @@ Autotuner<Backend, SearchStrategy>::tune(
     const std::unordered_map<size_t, std::vector<const DLConstTensor*>>& inputs,
     std::unordered_map<size_t, std::vector<const DLTensor*>>& outputs,
     const std::vector<typename Backend::MappingOptionsType>& baseMappings,
+    size_t topK,
     const TuningParameterFixer& fixedParams) {
   std::map<std::string, lang::TreeRef> tcEntryPointMap(tc::detail::parse(tc));
   TC_CHECK_EQ(tcEntryPointMap.count(tcEntryPoint), 1u)
@@ -511,7 +503,12 @@ Autotuner<Backend, SearchStrategy>::tune(
     std::rethrow_exception(tuningHarnessThreadEx);
   }
 
-  return {tuningHarness.bestMappingOptions()};
+  return optionsCache->getTopKOptions(
+      lang::canonicalTc(tcEntryPointMap.at(tcEntryPoint)),
+      makeTensorInfoVector(inputs.begin()->second),
+      makeTensorInfoVector(outputs.begin()->second),
+      Backend::backendString(),
+      topK);
 }
 } // namespace autotune
 } // namespace tc
diff --git a/tc/autotuner/autotuner.h b/tc/autotuner/autotuner.h
@@ -67,9 +67,6 @@ class TuningHarness {
   /// TODO: we should detect when we come from python and exit properly in C++.
   void stopAfterCurrentIteration();
 
-  /// Under lock, returns the best mapping options found so far
-  const MappingOptionsType& bestMappingOptions() const;
-
  private:
   /// Traverse one iteration of candidates in parallel and evaluate their
   /// runtimes
@@ -92,7 +89,6 @@ class TuningHarness {
   /// This way it is easy to implement multi-threaded termination by just
   /// taking an atomic counter and pushing/popping the queues under lock until
   /// we have evaluated searchStrategy->population.size() compilation results.
-  mutable std::mutex bestTimeMutex_;
   std::mutex executorsMutex_;
   std::atomic_bool stopRequested_;
   std::atomic_size_t currentCompilationJob_;
@@ -112,10 +108,6 @@ class TuningHarness {
   const std::unordered_map<size_t, std::vector<const DLConstTensor*>> inputs_;
   std::unordered_map<size_t, std::vector<const DLTensor*>> outputs_;
 
-  // results
-  Duration bestTime_;
-  MappingOptionsType bestMappingOptions_;
-
   // backing options cache
   std::shared_ptr<OptionsCache<Backend>> optionsCache_;
 };
@@ -165,6 +157,7 @@ class Autotuner {
           inputs,
       std::unordered_map<size_t, std::vector<const DLTensor*>>& outputs,
       const std::vector<MappingOptionsType>& baseMapping,
+      size_t topK = 1,
       const TuningParameterFixer& fixedParams = TuningParameterFixer());
 
  public:
diff --git a/tc/autotuner/options_cache-inl.h b/tc/autotuner/options_cache-inl.h
@@ -28,6 +28,7 @@
 
 #include "tc/core/check.h"
 #include "tc/core/compiler.h"
+#include "tc/core/functional.h"
 #include "tc/core/tensor.h"
 #include "tc/core/utils/math.h"
 #include "tc/core/utils/time.h"
@@ -235,8 +236,8 @@ std::vector<OptionsWithMedianAndRuntimes<Backend>> sortedOptions(
 } // namespace detail
 
 template <typename Backend>
-std::vector<typename Backend::MappingOptionsType>
-OptionsCache<Backend>::getTopKOptions(
+std::vector<std::pair<typename Backend::MappingOptionsType, Duration>>
+OptionsCache<Backend>::getTopKEntries(
     const lang::CanonicalTcString& tc,
     const std::vector<TensorInfo>& inputs,
     const std::vector<TensorInfo>& outputs,
@@ -249,15 +250,32 @@ OptionsCache<Backend>::getTopKOptions(
   if (sorted.size() == 0u) {
     return {};
   }
-  std::vector<typename Backend::MappingOptionsType> res;
+  std::vector<std::pair<typename Backend::MappingOptionsType, Duration>> res;
   res.reserve(K);
   for (size_t i = 0; i < std::min(K, sorted.size()); ++i) {
-    res.push_back(sorted[i].mappingOptions);
+    res.push_back(std::make_pair(sorted[i].mappingOptions, sorted[i].median));
   }
   ++numberSuccessfulRetrievals;
   return res;
 }
 
+template <typename Backend>
+std::vector<typename Backend::MappingOptionsType>
+OptionsCache<Backend>::getTopKOptions(
+    const lang::CanonicalTcString& tc,
+    const std::vector<TensorInfo>& inputs,
+    const std::vector<TensorInfo>& outputs,
+    const std::string& backendStr,
+    size_t K) const {
+  auto vBest = getTopKEntries(tc, inputs, outputs, backendStr, K);
+  using ReturnType = typename Backend::MappingOptionsType;
+  using ValueType = typename decltype(vBest)::value_type;
+  std::function<ReturnType(ValueType)> map = [](ValueType in) {
+    return in.first;
+  };
+  return tc::functional::Map(map, vBest);
+}
+
 template <typename Backend>
 std::unordered_set<OptionsCacheKey, OptionsCacheKeyHash>
 OptionsCache<Backend>::getKeys() const {
diff --git a/tc/autotuner/options_cache.h b/tc/autotuner/options_cache.h
@@ -132,6 +132,19 @@ struct OptionsCache {
   /// particular TC/inputs/outputs/device. Note that the result may be empty
   /// (in particular if problem size is small and pruning threshold is too high
   /// for the problem size).
+  /// \returns a vector of pair<mapping options, Duration>
+  std::vector<std::pair<typename Backend::MappingOptionsType, Duration>>
+  getTopKEntries(
+      const lang::CanonicalTcString& tc,
+      const std::vector<TensorInfo>& inputs,
+      const std::vector<TensorInfo>& outputs,
+      const std::string& backendStr,
+      size_t K) const;
+
+  /// Returns the top-K mapping options that have the best median runtime for a
+  /// particular TC/inputs/outputs/device. Note that the result may be empty
+  /// (in particular if problem size is small and pruning threshold is too high
+  /// for the problem size).
   /// \returns a vector of mapping options
   std::vector<typename Backend::MappingOptionsType> getTopKOptions(
       const lang::CanonicalTcString& tc,
diff --git a/tc/core/functional.h b/tc/core/functional.h
@@ -21,7 +21,6 @@
 #include <vector>
 
 namespace tc {
-namespace polyhedral {
 namespace functional {
 
 template <typename I>
@@ -178,5 +177,4 @@ R MapReduce(std::function<R(R, I, bool)> fun, const std::vector<I>& vec) {
 }
 
 } // namespace functional
-} // namespace polyhedral
 } // namespace tc
diff --git a/tensor_comprehensions/pybinds/tclib.cc b/tensor_comprehensions/pybinds/tclib.cc
@@ -33,6 +33,7 @@
 #include "tc/core/cuda/cuda_backend.h"
 #include "tc/core/cuda/cuda_tc_executor.h"
 #include "tc/core/flags.h"
+#include "tc/core/functional.h"
 #include "tc/core/tensor.h"
 #include "tc/lang/canonicalize.h"
 
diff --git a/test/cuda/test_autotuner.cc b/test/cuda/test_autotuner.cc
@@ -72,7 +72,8 @@ struct ATenCompilationUnitTest : public ::testing::Test {
     if (FLAGS_no_memory_promotion) {
       fix.fixUseSharedMemory(false).fixUsePrivateMemory(false);
     }
-    auto options = geneticAutotuneATen.tune(name, inputs, {baseMapping}, fix);
+    auto options = geneticAutotuneATen.tune(
+        name, inputs, {baseMapping}, std::numeric_limits<size_t>::max(), fix);
     if (options.size() > 0) {
       return options[0];
     }

Original file line number	Diff line number	Diff line change
`@@ -72,7 +72,8 @@ struct ATenCompilationUnitTest : public ::testing::Test {`
`72`	`72`	`if (FLAGS_no_memory_promotion) {`
`73`	`73`	`fix.fixUseSharedMemory(false).fixUsePrivateMemory(false);`
`74`	`74`	`}`
`75`		`- auto options = geneticAutotuneATen.tune(name, inputs, {baseMapping}, fix);`
	`75`	`+ auto options = geneticAutotuneATen.tune(`
	`76`	`+ name, inputs, {baseMapping}, std::numeric_limits<size_t>::max(), fix);`
`76`	`77`	`if (options.size() > 0) {`
`77`	`78`	`return options[0];`
`78`	`79`	`}`