Skip to content
This repository was archived by the owner on Apr 28, 2023. It is now read-only.

Commit fa0936f

Browse files
Merge pull request #379 from nicolasvasilache/pr/c++api-refactor
[c++api] Refactor
2 parents b8efdaf + fd57aae commit fa0936f

File tree

118 files changed

+4851
-5304
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

118 files changed

+4851
-5304
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,3 +20,5 @@ tensor_comprehensions/version.py
2020
tensor_comprehensions/*.proto
2121
slurm-*
2222
examples/results*
23+
*.pyc
24+
test_python/tc_test/*

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ def tensordot(float(N, C1, C2, H, W) I0,
4141
O(n, c1, c3, h, w) +=! I0(n, c1, r_c2, h, w) * I1(n, r_c2, c3, h, w)
4242
}
4343
)TC";
44-
tc::ATenCompilationUnit<tc::CudaTcExecutor> atCompl;
44+
tc::ATenCompilationUnit<tc::CudaBackend> atCompl;
4545
atCompl.define(tc);
4646

4747
// 2. Allocate tensors with random data.

tc/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ add_subdirectory(proto)
55
add_subdirectory(version)
66
add_subdirectory(core)
77
add_subdirectory(autotuner)
8+
add_subdirectory(aten)
89

910
if (WITH_CAFFE2 AND WITH_CUDA)
1011
add_subdirectory(c2)

tc/aten/CMakeLists.txt

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
################################################################################
2+
# tc_aten
3+
#
4+
# Core CPU library with cross-compilation capabilities linked from
5+
# tc_aten
6+
################################################################################
7+
add_library(
8+
tc_aten
9+
10+
SHARED
11+
12+
aten_compiler.cc
13+
)
14+
target_link_libraries(
15+
tc_aten
16+
17+
${HALIDE_LIBRARIES}
18+
19+
tc_core
20+
)
21+
install(
22+
TARGETS
23+
tc_aten
24+
25+
DESTINATION lib
26+
)

tc/aten/utils-inl.h renamed to tc/aten/aten-inl.h

Lines changed: 25 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -20,36 +20,41 @@
2020

2121
#include <ATen/ATen.h>
2222
#include <ATen/DLConvertor.h>
23+
24+
#include "tc/core/tensor.h"
25+
2326
namespace tc {
24-
namespace {
25-
inline std::pair<std::vector<DLTensor*>, std::vector<DLManagedTensor*>>
26-
toDlpackTensors(const std::vector<at::Tensor>& tensors) {
27-
std::vector<DLTensor*> dlTensors;
28-
std::vector<DLManagedTensor*> dlMTensors;
27+
namespace aten {
28+
inline std::vector<DLTensorUPtr> makeDLTensors(
29+
const std::vector<at::Tensor>& tensors) {
30+
std::vector<DLTensorUPtr> dlTensors;
2931
for (auto tensor : tensors) {
3032
auto dlMTensor = at::toDLPack(tensor);
31-
dlTensors.push_back(&(dlMTensor->dl_tensor));
32-
dlMTensors.push_back(dlMTensor);
33+
dlTensors.push_back(makeDLTensor(&(dlMTensor->dl_tensor)));
34+
dlMTensor->deleter(dlMTensor);
3335
}
34-
return make_pair(dlTensors, dlMTensors);
36+
return dlTensors;
3537
}
3638

37-
inline std::pair<std::vector<const DLTensor*>, std::vector<DLManagedTensor*>>
38-
toConstDlpackTensors(const std::vector<at::Tensor>& tensors) {
39-
std::vector<const DLTensor*> dlTensors;
40-
std::vector<DLManagedTensor*> dlMTensors;
39+
inline std::vector<DLConstTensorUPtr> makeDLConstTensors(
40+
const std::vector<at::Tensor>& tensors) {
41+
std::vector<DLConstTensorUPtr> dlTensors;
4142
for (auto tensor : tensors) {
4243
auto dlMTensor = at::toDLPack(tensor);
43-
dlTensors.push_back(&(dlMTensor->dl_tensor));
44-
dlMTensors.push_back(dlMTensor);
44+
dlTensors.push_back(makeDLConstTensor(&(dlMTensor->dl_tensor)));
45+
dlMTensor->deleter(dlMTensor);
4546
}
46-
return make_pair(dlTensors, dlMTensors);
47+
return dlTensors;
4748
}
4849

49-
inline void deleteDlmTensors(std::vector<DLManagedTensor*>& tensors) {
50-
for (auto& tensor : tensors) {
51-
tensor->deleter(tensor);
52-
}
50+
inline void setAtenSeed(uint64_t seed, at::Backend backend) {
51+
at::Generator& gen = at::globalContext().defaultGenerator(backend);
52+
gen.manualSeed(seed);
53+
}
54+
55+
inline uint64_t getAtenSeed(at::Backend backend) {
56+
at::Generator& gen = at::globalContext().defaultGenerator(backend);
57+
return gen.seed();
5358
}
54-
} // namespace
59+
} // namespace aten
5560
} // namespace tc

tc/aten/utils.h renamed to tc/aten/aten.h

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -19,18 +19,22 @@
1919
#include <vector>
2020

2121
#include <ATen/ATen.h>
22-
#include <ATen/DLConvertor.h>
22+
23+
#include "tc/core/tensor.h"
2324

2425
namespace tc {
25-
namespace {
26-
std::pair<std::vector<DLTensor*>, std::vector<DLManagedTensor*>>
27-
toDlpackTensors(const std::vector<at::Tensor>& tensors);
26+
namespace aten {
27+
28+
inline std::vector<DLTensorUPtr> makeDLTensors(
29+
const std::vector<at::Tensor>& tensors);
30+
31+
inline std::vector<DLConstTensorUPtr> makeDLConstTensors(
32+
const std::vector<at::Tensor>& tensors);
2833

29-
std::pair<std::vector<const DLTensor*>, std::vector<DLManagedTensor*>>
30-
toConstDlpackTensors(const std::vector<at::Tensor>& tensors);
34+
inline void setAtenSeed(uint64_t seed, at::Backend backend);
35+
inline uint64_t getAtenSeed(at::Backend backend);
3136

32-
void deleteDlmTensors(std::vector<DLManagedTensor*>& tensors);
33-
} // namespace
37+
} // namespace aten
3438
} // namespace tc
3539

36-
#include "tc/aten/utils-inl.h"
40+
#include "tc/aten/aten-inl.h"

tc/aten/aten_autotuner-inl.h

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
/**
2+
* Copyright (c) 2017-present, Facebook, Inc.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
#include "tc/autotuner/autotuner.h"
17+
18+
#include <atomic>
19+
#include <chrono>
20+
#include <numeric>
21+
#include <thread>
22+
23+
#include <glog/stl_logging.h>
24+
25+
#include "tc/aten/aten.h"
26+
#include "tc/aten/aten_compiler.h"
27+
#include "tc/autotuner/utils.h"
28+
#include "tc/core/compiler.h"
29+
#include "tc/core/flags.h"
30+
#include "tc/core/scope_guard.h"
31+
#include "tc/core/tensor.h"
32+
#include "tc/core/utils/math.h"
33+
#include "tc/lang/canonicalize.h"
34+
35+
namespace tc {
36+
namespace aten {
37+
template <typename Backend, typename Search>
38+
ATenAutotuner<Backend, Search>::ATenAutotuner(const std::string& tc)
39+
: tc_(tc) {}
40+
41+
std::vector<at::Tensor> cloneTensors(const std::vector<at::Tensor>& inputs) {
42+
std::vector<at::Tensor> copies;
43+
copies.reserve(inputs.size());
44+
for (const auto& t : inputs) {
45+
copies.push_back(t.clone());
46+
}
47+
return copies;
48+
}
49+
50+
template <typename Backend, typename Search>
51+
std::vector<typename Backend::MappingOptionsType>
52+
ATenAutotuner<Backend, Search>::tune(
53+
const std::string& tcName,
54+
const std::vector<at::Tensor>& inputs,
55+
const typename Backend::MappingOptionsType& baseMapping,
56+
const std::string& cacheFileName,
57+
const tc::autotune::TuningParameterFixer& fixedParams) {
58+
// TODO: some checks that inputs memory lives on the proper Backend device
59+
60+
// prepare outputs of the proper shape
61+
auto outputs = tc::aten::prepareOutputs(tc_, tcName, inputs);
62+
63+
// first parse the devices
64+
auto devices =
65+
tc::autotune::detail::parseDevices<Backend>(FLAGS_tuner_devices);
66+
// clone the inputs/outputs on each device
67+
// TODO: this takes twice the space it should, alternatives are:
68+
// 1. enforce inputs and outputs live on the CPU in the first place so we
69+
// don't spuriously run out of device memory (assuming CPU memory is
70+
// infinite for now);
71+
// 2. if 1. is not reasonable, detect the device on which each tensor lives
72+
// and point to the raw data for that (device, tensor) pair.
73+
std::unordered_map<size_t, std::vector<DLConstTensorUPtr>> inputsPerDevice;
74+
std::unordered_map<size_t, std::vector<const DLConstTensor*>>
75+
rawInputsPerDevice;
76+
std::unordered_map<size_t, std::vector<DLTensorUPtr>> outputsPerDevice;
77+
std::unordered_map<size_t, std::vector<const DLTensor*>> rawOutputsPerDevice;
78+
for (auto device : devices) {
79+
typename Backend::WithDevice wd(device);
80+
auto deviceInputs = cloneTensors(inputs);
81+
inputsPerDevice.emplace(device, makeDLConstTensors(deviceInputs));
82+
rawInputsPerDevice.emplace(
83+
device, extractRawPtrs(inputsPerDevice.at(device)));
84+
auto deviceOutputs = cloneTensors(outputs);
85+
outputsPerDevice.emplace(device, makeDLTensors(deviceOutputs));
86+
rawOutputsPerDevice.emplace(
87+
device, extractRawPtrs(outputsPerDevice.at(device)));
88+
}
89+
return tc::autotune::Autotuner<Backend, Search>::tune(
90+
tc_,
91+
tcName,
92+
rawInputsPerDevice,
93+
rawOutputsPerDevice,
94+
baseMapping,
95+
cacheFileName,
96+
fixedParams);
97+
}
98+
} // namespace aten
99+
} // namespace tc

tc/aten/aten_autotuner.h

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
/**
2+
* Copyright (c) 2017-present, Facebook, Inc.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
#pragma once
17+
18+
#include <string>
19+
#include <vector>
20+
21+
#include "tc/aten/aten.h"
22+
#include "tc/autotuner/autotuner.h"
23+
24+
namespace tc {
25+
namespace aten {
26+
/**
27+
* An Autotuner provides the basic interface to run a SearchStrategy over a
28+
* particular Backend.
29+
*
30+
* Possible usage:
31+
* using namespace tc::aten;
32+
* std::string tc("...");
33+
* ATenAutotuner<tc::CudaBackend, tc::autotune::GeneticSearch> tuner(tc);
34+
* std::string cacheFn("/tmp/some_file");
35+
* auto best = tuner.tune("tc_function_name", inputs, baseOption, cacheFn)
36+
*
37+
* The best options may then be used to compile an executor and run.
38+
* CHECK_GT(best.size(), 0);
39+
* auto pExecutor = compile(tc, "tc_function_name", inputs, best[0]);
40+
* auto outputs = prepareOutputs(tc, "tc_function_name", inputs);
41+
* // memoize the executor and outputs if needed
42+
* run(*pExecutor, inputs, outputs);
43+
*/
44+
template <typename Backend, typename SearchStrategy>
45+
class ATenAutotuner : public tc::autotune::Autotuner<Backend, SearchStrategy> {
46+
public:
47+
using BackendType = Backend;
48+
using MappingOptionsType = typename BackendType::MappingOptionsType;
49+
50+
/// An ATenAutotuner is built from a TC string which contains multiple TC
51+
/// functions on which tuning can be run independently.
52+
ATenAutotuner(const std::string& tc);
53+
54+
/// Runs autotuning on the TC function tcEntryPoint.
55+
/// Proper output shapes are inferred automatically from the input shapes.
56+
///
57+
/// Optionally an OptionsCache cacheFileName serialized path
58+
/// can be specified to which the tuner will save the best options found for
59+
/// later offline reuse, in the proper protobuf format.
60+
///
61+
/// Additionally, if such a cacheFileName is specified and if it contains a
62+
/// previously saved protobuf then the autotuner will load it. In that case
63+
/// the tuner recovers multiple starting points and appends them to the
64+
/// baseMapping. This can be useful in a reinforcement situation where short
65+
/// tunings are run and their results cached iteratively. The best options
66+
/// are still saved at the end of tuning, possibly overwriting that
67+
/// previously saved protobuf file.
68+
///
69+
/// Lastly a TuningParameterFixer function can be specified to limit the
70+
/// search space (i.e. when certain parameters are known to be good/bad
71+
/// independently on a particular TC).
72+
///
73+
/// \return a vector MappingOptions, if it is empty then tuning did not find
74+
/// a single good configuration. This should be a very rare occurrence but
75+
/// it is possible in particular if the skipExecutionOrWarmup function is too
76+
/// aggressive and the problem size is too small. If the vector is not empty
77+
/// it contains the best performing options for the particular Backend,
78+
/// ranked by execution speed, where result[0] is the fastest.
79+
std::vector<MappingOptionsType> tune(
80+
const std::string& tcEntryPoint,
81+
const std::vector<at::Tensor>& inputs,
82+
const MappingOptionsType& baseMapping,
83+
const std::string& cacheFileName = "",
84+
const tc::autotune::TuningParameterFixer& fixedParams = {});
85+
86+
protected:
87+
/// The TC string is stored internally so we can tune independent TC
88+
/// functions on demand.
89+
const std::string tc_;
90+
};
91+
} // namespace aten
92+
} // namespace tc
93+
94+
#include "tc/aten/aten_autotuner-inl.h"

0 commit comments

Comments (0)