Skip to content
This repository was archived by the owner on Apr 28, 2023. It is now read-only.

Commit 58207a9

Browse files
Merge pull request #506 from nicolasvasilache/pr/drop-implicit-tuner-cache-interactions
Drop implicit tuner cache interactions
2 parents 89906d9 + 2448f68 commit 58207a9

27 files changed

+243
-375
lines changed

README.md

Lines changed: 23 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -41,29 +41,27 @@ def tensordot(float(N, C1, C2, H, W) I0,
4141
O(n, c1, c3, h, w) +=! I0(n, c1, r_c2, h, w) * I1(n, r_c2, c3, h, w)
4242
}
4343
)TC";
44-
tc::ATenCompilationUnit<tc::CudaBackend> atCompl;
45-
atCompl.define(tc);
4644

4745
// 2. Allocate tensors with random data.
4846
at::Tensor I0 = at::CUDA(at::kFloat).rand({32, 8, 16, 17, 25});
4947
at::Tensor I1 = at::CUDA(at::kFloat).rand({32, 16, 2, 17, 25});
5048

5149
// 3. Run autotuning with evolutionary search starting from a naive option.
52-
auto options = tc::CudaMappingOptions::makeNaiveMappingOptions();
53-
tc::autotune::GeneticAutotunerATen geneticAutotuneATen(tc);
54-
auto bestOption = geneticAutotuneATen.tune(
55-
"/tmp/save_results", "tensordot", {I0, I1}, options);
56-
57-
// 4. Compile and run the TC with the best option.
58-
// Outputs get allocated; could also be pre-allocated and passed.
59-
auto handle = atCompl.compile("tensordot", {I0, I1}, bestOption.getValue());
60-
std::vector<at::Tensor> outputs;
61-
auto duration = atCompl.run("tensordot", {I0, I1}, outputs, handle, true);
62-
std::cout
63-
<< "tensordot size I0: " << I0.sizes() << ", "
64-
<< "size I1: " << I1.sizes() << " ran in: "
65-
<< std::chrono::duration_cast<std::chrono::microseconds>(duration).count()
66-
<< "us\n";
50+
auto naiveOptions = Backend::MappingOptionsType::makeNaiveMappingOptions();
51+
tc::aten::ATenAutotuner<tc::CudaBackend, tc::autotune::GeneticSearch>
52+
geneticAutotuneATen(tc);
53+
auto bestOption =
54+
geneticAutotuneATen.tune("tensordot", {I0, I1}, {naiveOptions});
55+
56+
// 4. Compile and run the TC with the best option after allocating output
57+
// tensors.
58+
auto pExecutor =
59+
tc::aten::compile<Backend>(tc, "tensordot", {I0, I1}, bestOption[0]);
60+
auto outputs = tc::aten::prepareOutputs(tc, "tensordot", {I0, I1});
61+
auto timings = tc::aten::profile(*pExecutor, {I0, I1}, outputs);
62+
std::cout << "tensordot size I0: " << I0.sizes() << ", "
63+
<< "size I1: " << I1.sizes()
64+
<< " ran in: " << timings.kernelRuntime.toMicroSeconds() << "us\n";
6765
}
6866
```
6967
@@ -76,15 +74,15 @@ for (auto sizes : std::vector<std::pair<at::IntList, at::IntList>>{
7674
{{4, 9, 7, 16, 14}, {4, 7, 3, 16, 14}},
7775
{{8, 5, 11, 10, 10}, {8, 11, 16, 10, 10}},
7876
}) {
79-
at::Tensor I0 = at::CUDA(at::kFloat).rand(sizes.first);
80-
at::Tensor I1 = at::CUDA(at::kFloat).rand(sizes.second);
81-
auto handle = atCompl.compile("tensordot", {I0, I1}, bestOption.getValue());
82-
std::vector<at::Tensor> outputs;
83-
auto duration = atCompl.run("tensordot", {I0, I1}, outputs, handle, true);
77+
at::Tensor I0 = makeATenTensor<Backend>(sizes.first);
78+
at::Tensor I1 = makeATenTensor<Backend>(sizes.second);
79+
auto pExecutor =
80+
tc::aten::compile<Backend>(tc, "tensordot", {I0, I1}, bestOption[0]);
81+
auto outputs = tc::aten::prepareOutputs(tc, "tensordot", {I0, I1});
82+
auto timings = tc::aten::profile(*pExecutor, {I0, I1}, outputs);
8483
std::cout << "tensordot size I0: " << I0.sizes() << ", "
85-
<< "size I1: " << I1.sizes() << " ran in: "
86-
<< std::chrono::duration_cast<std::chrono::microseconds>(duration)
87-
.count()
84+
<< "size I1: " << I1.sizes()
85+
<< " ran in: " << timings.kernelRuntime.toMicroSeconds()
8886
<< "us\n";
8987
}
9088
```
@@ -96,11 +94,9 @@ Putting it all together, one may see:
9694
[----------] Global test environment set-up.
9795
[----------] 1 test from TensorDot
9896
[ RUN ] TensorDot.SimpleAutotune
99-
Loading proto from: /tmp/save_results.options and /tmp/save_results.cuda
10097
Generation 0 Jobs(Compiled, GPU)/total (10, 10)/10 (best/median/worst)us: 226/4238/7345
10198
Generation 1 Jobs(Compiled, GPU)/total (10, 10)/10 (best/median/worst)us: 220/221/233
10299
Generation 2 Jobs(Compiled, GPU)/total (10, 10)/10 (best/median/worst)us: 220/221/234
103-
Dumping cache to /tmp/save_results.cuda/options
104100
tensordot size I0: [16, 8, 16, 17, 25], size I1: [16, 16, 2, 17, 25] ran in: 239us
105101
tensordot size I0: [4, 9, 7, 16, 14], size I1: [4, 7, 3, 16, 14] ran in: 56us
106102
tensordot size I0: [8, 5, 11, 10, 10], size I1: [8, 11, 16, 10, 10] ran in: 210us
@@ -112,32 +108,6 @@ tensordot size I0: [8, 5, 11, 10, 10], size I1: [8, 11, 16, 10, 10] ran in: 210u
112108
[ PASSED ] 1 test.
113109
```
114110
115-
Tuning results are then available and reusable in ```/tmp/save_results.cuda``` and ```/tmp/save_results.proto```.
116-
117-
Interestingly, note that running the same example again will start from the best saved results and improve upon them.
118-
Of course this has diminishing returns:
119-
```shell
120-
> build$ ./examples/example_simple
121-
[==========] Running 1 test from 1 test case.
122-
[----------] Global test environment set-up.
123-
[----------] 1 test from TensorDot
124-
[ RUN ] TensorDot.SimpleAutotune
125-
Loading proto from: /tmp/save_results.options and /tmp/save_results.cuda
126-
Generation 0 Jobs(Compiled, GPU)/total (10, 10)/10 (best/median/worst)us: 256/258/270
127-
Generation 1 Jobs(Compiled, GPU)/total (10, 10)/10 (best/median/worst)us: 158/255/616
128-
Generation 2 Jobs(Compiled, GPU)/total (10, 10)/10 (best/median/worst)us: 157/252/720
129-
Dumping cache to /tmp/save_results.cuda/options
130-
tensordot size I0: [16, 8, 16, 17, 25], size I1: [16, 16, 2, 17, 25] ran in: 172us
131-
tensordot size I0: [4, 9, 7, 16, 14], size I1: [4, 7, 3, 16, 14] ran in: 44us
132-
tensordot size I0: [8, 5, 11, 10, 10], size I1: [8, 11, 16, 10, 10] ran in: 88us
133-
[ OK ] TensorDot.SimpleAutotune (28232 ms)
134-
[----------] 1 test from TensorDot (28232 ms total)
135-
136-
[----------] Global test environment tear-down
137-
[==========] 1 test from 1 test case ran. (28232 ms total)
138-
[ PASSED ] 1 test.
139-
```
140-
141111
We have not yet characterized the precise fraction of peak performance we obtain but it is not uncommon to obtain 80%+ of peak shared memory bandwidth after autotuning. Solid register-level optimizations are still in the work but TC in its current form already addresses the productivity gap between the needs of research and the needs of production. Which is why we are excited to share it with the entire community and bring this collaborative effort in the open.
142112
143113
# Documentation

tc/aten/aten_autotuner-inl.h

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -52,8 +52,7 @@ std::vector<typename Backend::MappingOptionsType>
5252
ATenAutotuner<Backend, Search>::tune(
5353
const std::string& tcName,
5454
const std::vector<at::Tensor>& inputs,
55-
const typename Backend::MappingOptionsType& baseMapping,
56-
const std::string& cacheFileName,
55+
const std::vector<typename Backend::MappingOptionsType>& baseMappings,
5756
const tc::autotune::TuningParameterFixer& fixedParams) {
5857
// TODO: some checks that inputs memory lives on the proper Backend device
5958

@@ -91,8 +90,7 @@ ATenAutotuner<Backend, Search>::tune(
9190
tcName,
9291
rawInputsPerDevice,
9392
rawOutputsPerDevice,
94-
baseMapping,
95-
cacheFileName,
93+
baseMappings,
9694
fixedParams);
9795
}
9896
} // namespace aten

tc/aten/aten_autotuner.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -79,8 +79,7 @@ class ATenAutotuner : public tc::autotune::Autotuner<Backend, SearchStrategy> {
7979
std::vector<MappingOptionsType> tune(
8080
const std::string& tcEntryPoint,
8181
const std::vector<at::Tensor>& inputs,
82-
const MappingOptionsType& baseMapping,
83-
const std::string& cacheFileName = "",
82+
const std::vector<MappingOptionsType>& baseMappings,
8483
const tc::autotune::TuningParameterFixer& fixedParams = {});
8584

8685
protected:

tc/autotuner/autotuner-inl.h

Lines changed: 8 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -321,45 +321,6 @@ namespace {
321321
volatile std::sig_atomic_t sigint_ = 0;
322322
volatile std::sig_atomic_t sigterm_ = 0;
323323

324-
template <typename Backend>
325-
std::vector<typename Backend::MappingOptionsType> loadThroughCache(
326-
lang::TreeRef tree,
327-
std::shared_ptr<OptionsCache<Backend>> optionsCache,
328-
const std::string& cacheFileName,
329-
const std::vector<const DLConstTensor*>& inputs,
330-
const size_t numCandidates) {
331-
LOG_IF(INFO, FLAGS_debug_tuner)
332-
<< "Loading proto from: " << tc::makeOptionsFilename(cacheFileName)
333-
<< std::endl;
334-
if (!cacheFileName.empty()) {
335-
optionsCache->loadCacheFromFile(tc::makeOptionsFilename(cacheFileName));
336-
}
337-
auto outputs = tc::detail::inferOutputTensorInfo(tree, inputs);
338-
return optionsCache->getTopKOptions(
339-
canonicalTc(tree),
340-
makeTensorInfoVector(inputs),
341-
outputs,
342-
Backend::backendString(),
343-
numCandidates);
344-
}
345-
346-
template <typename Backend>
347-
void storeTopKInCache(
348-
const std::shared_ptr<OptionsCache<Backend>>& optionsCache,
349-
const std::string& cacheFilename) {
350-
if (cacheFilename.empty()) {
351-
LOG_IF(INFO, FLAGS_debug_tuner)
352-
<< "No filepath provided, not saving cache" << std::endl;
353-
} else {
354-
LOG_IF(INFO, FLAGS_debug_tuner)
355-
<< "Dumping cache to " << tc::makeOptionsFilename(cacheFilename)
356-
<< std::endl;
357-
OptionsCache<Backend> cache(*optionsCache);
358-
cache.pruneKeepTopK(tc::FLAGS_tuner_save_best_candidates_count);
359-
cache.storeCacheToFile(tc::makeOptionsFilename(cacheFilename));
360-
}
361-
}
362-
363324
void removeDuplicates(std::vector<size_t>& v) {
364325
std::sort(v.begin(), v.end());
365326
v.erase(std::unique(v.begin(), v.end()), v.end());
@@ -416,7 +377,7 @@ void setupTuningParameters(
416377

417378
template <typename Backend, typename SearchStrategy>
418379
Autotuner<Backend, SearchStrategy>::Autotuner()
419-
: optionsCache_(new OptionsCache<Backend>()) {}
380+
: optionsCache(new OptionsCache<Backend>()) {}
420381

421382
template <typename Backend, typename SearchStrategy>
422383
std::vector<typename Backend::MappingOptionsType>
@@ -425,8 +386,7 @@ Autotuner<Backend, SearchStrategy>::tune(
425386
const std::string& tcEntryPoint,
426387
const std::unordered_map<size_t, std::vector<const DLConstTensor*>>& inputs,
427388
std::unordered_map<size_t, std::vector<const DLTensor*>>& outputs,
428-
const typename Backend::MappingOptionsType& baseMapping,
429-
const std::string& cacheFileName,
389+
const std::vector<typename Backend::MappingOptionsType>& baseMappings,
430390
const TuningParameterFixer& fixedParams) {
431391
std::map<std::string, lang::TreeRef> tcEntryPointMap(tc::detail::parse(tc));
432392
TC_CHECK_EQ(tcEntryPointMap.count(tcEntryPoint), 1u)
@@ -438,28 +398,13 @@ Autotuner<Backend, SearchStrategy>::tune(
438398
setupTuningParameters(inputs.begin()->second, modelConfiguration);
439399
modelConfiguration.fixParameters(fixedParams);
440400

441-
// Build starting points from baseMapping + whatever we recover from cache
442-
std::vector<typename Backend::MappingOptionsType> startingPoints{baseMapping};
443-
auto restoredCandidates = loadThroughCache<Backend>(
444-
tcEntryPointMap.at(tcEntryPoint),
445-
optionsCache_,
446-
cacheFileName,
447-
inputs.begin()->second,
448-
FLAGS_tuner_gen_restore_number);
449-
if (restoredCandidates.size() > 0) {
450-
startingPoints.reserve(1 + restoredCandidates.size());
451-
std::move(
452-
restoredCandidates.begin(),
453-
restoredCandidates.end(),
454-
std::back_inserter(startingPoints));
455-
}
456-
457401
// Create initial configs based on options + model configuration
402+
const std::vector<typename Backend::MappingOptionsType> options{baseMappings};
458403
std::vector<TuningConfiguration> configs;
459-
configs.reserve(startingPoints.size());
404+
configs.reserve(options.size());
460405
std::transform(
461-
startingPoints.begin(),
462-
startingPoints.end(),
406+
options.begin(),
407+
options.end(),
463408
std::back_inserter(configs),
464409
[this, &fixedParams, &modelConfiguration](
465410
const typename Backend::MappingOptionsType& options) {
@@ -484,9 +429,9 @@ Autotuner<Backend, SearchStrategy>::tune(
484429
tcEntryPointMap.at(tcEntryPoint),
485430
inputs,
486431
outputs,
487-
baseMapping,
432+
options[0],
488433
fixedParams,
489-
optionsCache_);
434+
optionsCache);
490435

491436
// Setup handlers
492437
sigterm_ = 0;
@@ -505,10 +450,6 @@ Autotuner<Backend, SearchStrategy>::tune(
505450
try {
506451
tuningHarness.run(searchStrategy);
507452
} catch (const std::exception& e) {
508-
std::cerr << "Exception during autotuning: " << e.what()
509-
<< "\n dumping cache to "
510-
<< tc::makeOptionsFilename(cacheFileName) << std::endl;
511-
storeTopKInCache<Backend>(optionsCache_, cacheFileName);
512453
tuningHarnessThreadEx = std::current_exception();
513454
}
514455
tuningHarnessFinished = true;
@@ -517,11 +458,9 @@ Autotuner<Backend, SearchStrategy>::tune(
517458
std::this_thread::sleep_for(std::chrono::milliseconds(100));
518459
if (sigint_) {
519460
tuningHarness.stopAfterCurrentIteration();
520-
storeTopKInCache<Backend>(optionsCache_, cacheFileName);
521461
}
522462
if (sigterm_) {
523463
std::cerr << "Autotuning aborted." << std::endl;
524-
storeTopKInCache<Backend>(optionsCache_, cacheFileName);
525464
std::abort();
526465
}
527466
}
@@ -532,8 +471,6 @@ Autotuner<Backend, SearchStrategy>::tune(
532471
std::rethrow_exception(tuningHarnessThreadEx);
533472
}
534473

535-
storeTopKInCache<Backend>(optionsCache_, cacheFileName);
536-
537474
return {tuningHarness.bestMappingOptions()};
538475
}
539476
} // namespace autotune

tc/autotuner/autotuner.h

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -164,12 +164,14 @@ class Autotuner {
164164
const std::unordered_map<size_t, std::vector<const DLConstTensor*>>&
165165
inputs,
166166
std::unordered_map<size_t, std::vector<const DLTensor*>>& outputs,
167-
const MappingOptionsType& baseMapping,
168-
const std::string& cacheFileName = "",
167+
const std::vector<MappingOptionsType>& baseMapping,
169168
const TuningParameterFixer& fixedParams = TuningParameterFixer());
170169

171-
private:
172-
std::shared_ptr<OptionsCache<Backend>> optionsCache_;
170+
public:
171+
/// This is accessed by multiple threads in the tuning harness.
172+
/// Even though manipulations are threadsafe, you want to be sure tuning
173+
/// has finished before accessing the optionsCache.
174+
std::shared_ptr<OptionsCache<Backend>> optionsCache;
173175
};
174176

175177
/// Helper functions that need specializing for various backends.

tc/autotuner/options_cache-inl.h

Lines changed: 36 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
#include <llvm/ADT/Optional.h>
2828

2929
#include "tc/core/check.h"
30+
#include "tc/core/compiler.h"
3031
#include "tc/core/tensor.h"
3132
#include "tc/core/utils/math.h"
3233
#include "tc/core/utils/time.h"
@@ -163,12 +164,10 @@ void OptionsCache<Backend>::storeCacheToFile(
163164
std::lock_guard<std::mutex> lock(mutex);
164165
std::fstream serialized(
165166
filename, std::ios::binary | std::ios::trunc | std::ios::out);
166-
if (!serialized.is_open()) {
167-
LOG(ERROR) << "Failed to open the output stream for dumping protobuf: "
168-
<< filename;
169-
} else {
170-
proto.SerializePartialToOstream(&serialized);
171-
}
167+
TC_CHECK(serialized.is_open(), std::invalid_argument)
168+
<< "Failed to open the output stream for dumping protobuf: "
169+
<< filename;
170+
proto.SerializePartialToOstream(&serialized);
172171
}
173172
}
174173

@@ -317,9 +316,37 @@ void OptionsCache<Backend>::fromProtobuf(
317316
}
318317
}
319318

320-
} // namespace autotune
319+
template <typename Backend>
320+
std::vector<typename Backend::MappingOptionsType> loadTopKFromCacheFile(
321+
const std::string& tc,
322+
const std::string& entryPoint,
323+
const std::string& cacheFilename,
324+
const std::vector<const DLConstTensor*>& inputs,
325+
size_t count) {
326+
OptionsCache<Backend> optionsCache;
327+
optionsCache.loadCacheFromFile(cacheFilename);
328+
auto outputs = tc::inferOutputTensorInfo(tc, entryPoint, inputs);
329+
return optionsCache.getTopKOptions(
330+
lang::canonicalTc(tc::detail::parse(tc).at(entryPoint)),
331+
tc::makeTensorInfoVector(inputs),
332+
outputs,
333+
Backend::backendString(),
334+
count);
335+
}
321336

322-
inline std::string makeOptionsFilename(const std::string& fn) {
323-
return fn + ".options";
337+
template <typename Backend>
338+
void appendTopKToCacheFile(
339+
const std::shared_ptr<OptionsCache<Backend>>& cache,
340+
const std::string& cacheFilename,
341+
uint32_t count) {
342+
OptionsCache<Backend> copy(*cache);
343+
copy.pruneKeepTopK(count);
344+
auto proto = copy.toProtobuf();
345+
OptionsCache<Backend> optionsCache;
346+
optionsCache.loadCacheFromFile(cacheFilename);
347+
optionsCache.fromProtobuf(proto);
348+
optionsCache.storeCacheToFile(cacheFilename);
324349
}
350+
351+
} // namespace autotune
325352
} // namespace tc

0 commit comments

Comments
 (0)