
Commit 11b0c31

Add simpler pybind entry point to TC
If the Python user can memoize the executor resulting from compilation, then this API should be preferred. Otherwise, as in PyTorch autograd functions, the compilation cache should be used.
1 parent bfde5ea commit 11b0c31
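
For orientation, here is a minimal sketch of the two usage patterns the commit message contrasts, using the names from the example diff below (mm is the TC definition string, mat1/mat2 the input tensors); it assumes MappingOptions is importable from tensor_comprehensions.tclib alongside the other symbols:

    from tensor_comprehensions.tclib import compile, CompilationCache, MappingOptions

    # Preferred path: the caller memoizes the executor returned by compile().
    executor = compile(mm, "matmul", (mat1, mat2), MappingOptions())
    outputs = executor.run((mat1, mat2), ())                        # infers and allocates outputs
    outputs = executor.unchecked_run((mat1, mat2), tuple(outputs))  # reuses the allocated outputs

    # Fallback path (e.g. PyTorch autograd functions): keep a CompilationCache.
    compilation_cache = CompilationCache(mm)
    compilation_cache.compile("matmul", (mat1, mat2), MappingOptions())
    outputs = compilation_cache.unchecked_run("matmul", (mat1, mat2), ())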

File tree: 2 files changed, +100 / -15 lines

python/examples/tc_pybind_example.py

Lines changed: 50 additions & 15 deletions
@@ -17,6 +17,13 @@
 
 dump_backward_overhead = False
 
+################################################################################
+# The purpose of these examples is to demonstrate the usage of the python
+# bindings to build a simple, low-overhead, python abstraction.
+# We demonstrate the bindings by building a series of examples leading to a
+# MultiTcFunction abstraction for PyTorch autograd.
+################################################################################
+
 ################################################################################
 # 0. Initializations
 ################################################################################
@@ -33,7 +40,7 @@ def time_tc(iters, prepend, runFun, tc_name, inputs):
     start = time.clock()
     if dump_backward_overhead:
         dump_backward_overhead = time.clock()
-    outputs = runFun(tc_name, inputs, ())
+    outputs = runFun(tc_name, inputs)
     timesCPU.append(time.clock() - start)
     torch.cuda.synchronize()
    timesCPUAndGPU.append(time.clock() - start)
@@ -68,23 +75,51 @@ def matmul_grad(float(M,N) A, float(N,K) B, float(M,K) d_O) -> (d_A, d_B) {
 mat1, mat2 = torch.randn(300, 400).cuda(), torch.randn(400, 500).cuda()
 
 ################################################################################
-# 1. Use the C++ API to build a low-overhead compilation cache and time it
+# 1. Use the simple high-overhead compile/run C++ API
+# If one can keep state in their layer or wishes to experiment with TC,
+# this is a simple entry point.
+# If state cannot be kept, be aware that this API has a non-trivial overhead
+# when output sizes need to be inferred and outputs allocated.
+# Compilation itself has a prohibitive cost and needs to be memoized either
+# by holding on to the executor or by using the low-overhead abstraction, see
+# below.
+################################################################################
+from tensor_comprehensions.tclib import compile
+
+executor = compile(mm, "matmul", (mat1, mat2), MappingOptions())
+outputs = executor.run((mat1, mat2), ())
+outputs = executor.unchecked_run((mat1, mat2), tuple(outputs))
+time_tc(100,
+        "simple API\t",
+        lambda name, ins: executor.unchecked_run(ins, tuple(outputs)),
+        "matmul",
+        (mat1, mat2))
+time_tc(100,
+        "simple API (with allocation overhead)\t",
+        lambda name, ins: executor.unchecked_run(ins, ()),
+        "matmul",
+        (mat1, mat2))
+
+################################################################################
+# 2. Use the C++ API to build a low-overhead compilation cache and time it
 ################################################################################
 from tensor_comprehensions.tclib import CompilationCache
 
 compilation_cache = CompilationCache(mm)
 # Compilation returns an allocated tuple of outputs with the proper shapes.
 # Allocation overhead is negligible compared to compilation overhead.
 compilation_cache.compile("matmul", (mat1, mat2), MappingOptions())
+# Run once without timing
+compilation_cache.unchecked_run("matmul", (mat1, mat2), ())
 # unchecked_run on tensors
 time_tc(100,
         "raw unchecked_run naive options\t",
-        lambda name, ins, outs: compilation_cache.unchecked_run(name, ins, outs),
+        lambda name, ins: compilation_cache.unchecked_run(name, ins, ()),
         "matmul",
         (mat1, mat2))
 
 ################################################################################
-# 2. Short tuning run saving to file then load the best option to create a
+# 3. Short tuning run saving to file then load the best option to create a
 # compilation cache
 ################################################################################
 from tensor_comprehensions.tclib import Tuner
@@ -111,12 +146,12 @@ def matmul_grad(float(M,N) A, float(N,K) B, float(M,K) d_O) -> (d_A, d_B) {
 compilation_cache.compile("matmul", (mat1, mat2), top1)
 time_tc(100,
         "raw unchecked_run tuned options\t",
-        lambda name, ins, outs: compilation_cache.unchecked_run(name, ins, outs),
+        lambda name, ins: compilation_cache.unchecked_run(name, ins, ()),
         "matmul",
         (mat1, mat2))
 
 ################################################################################
-# 3. Simple TC builder
+# 4. Simple TC builder
 ################################################################################
 class TcBuilder():
     def __init__(self,
@@ -200,12 +235,12 @@ def compileOrTune(self, name = "", force_reinforcement_tuning = False, inputs =
 tcb.compileOrTune(name = "matmul", inputs = (mat1, mat2))
 time_tc(100,
         "TcBuilder unchecked_run\t",
-        lambda name, ins, outs: tcb.compilation_cache.unchecked_run(name, ins, outs),
+        lambda name, ins: tcb.compilation_cache.unchecked_run(name, ins, ()),
         "matmul",
         (mat1, mat2))
 
 ################################################################################
-# 4. Simple torch.autograd.Function backed by TcBuilder
+# 5. Simple torch.autograd.Function backed by TcBuilder
 ################################################################################
 class TcFunction(torch.autograd.Function):
     @staticmethod
@@ -283,7 +318,7 @@ def backward(ctx, *gradients):
 
 time_tc(100,
         "TcFunction forward unchecked_run\t",
-        lambda name, ins, outs: TcFunction.apply(tcb, *ins),
+        lambda name, ins: TcFunction.apply(tcb, *ins),
         "matmul",
         (mat1, mat2))
 
@@ -306,7 +341,7 @@ def backward(ctx, *gradients):
 dump_backward_overhead = False
 time_tc(100,
         "TcFunction backward unchecked_run\t",
-        lambda name, ins, outs: outputs[0].backward(grad_sized_tensor, retain_graph = True),
+        lambda name, ins: outputs[0].backward(grad_sized_tensor, retain_graph = True),
         "matmul",
         (mat1, mat2))
 
@@ -316,7 +351,7 @@ def backward(ctx, *gradients):
 v.backward(retain_graph = True)
 
 ################################################################################
-# 5. Multi-TC builder
+# 6. Multi-TC builder
 ################################################################################
 class MultiTcBuilder():
     def __init__(self,
@@ -404,12 +439,12 @@ def compileOrTune(self, name = "", force_reinforcement_tuning = False, inputs =
 tcb.compileOrTune(name = "matmul", inputs = (mat1, mat2))
 time_tc(100,
         "MultiTcBuilder unchecked_run\t",
-        lambda name, ins, outs: tcb.compilation_cache.unchecked_run(name, ins, outs),
+        lambda name, ins: tcb.compilation_cache.unchecked_run(name, ins, ()),
         "matmul",
         (mat1, mat2))
 
 ################################################################################
-# 6. Multi-TC torch.autograd.Function backed by MultiTcBuilder
+# 7. Multi-TC torch.autograd.Function backed by MultiTcBuilder
 ################################################################################
 class MultiTcFunction(torch.autograd.Function):
     @staticmethod
@@ -508,7 +543,7 @@ def backward(ctx, *gradients):
 
 time_tc(100,
         "MultiTcFunction forward unchecked_run\t",
-        lambda name, ins, outs: MultiTcFunction.apply(tcb, *ins),
+        lambda name, ins: MultiTcFunction.apply(tcb, *ins),
         "matmul",
         (mat1, mat2))
 
@@ -531,7 +566,7 @@ def backward(ctx, *gradients):
 dump_backward_overhead = False
 time_tc(100,
         "MultiTcFunction backward unchecked_run\t",
-        lambda name, ins, outs: outputs[0].backward(grad_sized_tensor, retain_graph = True),
+        lambda name, ins: outputs[0].backward(grad_sized_tensor, retain_graph = True),
         "matmul",
         (mat1, mat2))
 
tensor_comprehensions/pybinds/tclib.cc

Lines changed: 50 additions & 0 deletions
@@ -236,6 +236,42 @@ class Tuner : public ATenCudaGeneticTuner {
   std::string cacheFileName;
 };
 
+struct TcExecutor {
+  py::list run(
+      const py::tuple& inputs,
+      const py::tuple& outputs = py::tuple()) {
+    if (outputs.size() > 0) {
+      auto atOutputs = getATenTensors(outputs);
+      auto atInputs = getATenTensors(inputs);
+      tc::aten::run(*executor, atInputs, atOutputs);
+      return py::list(outputs);
+    } else {
+      auto atInputs = getATenTensors(inputs);
+      auto atOutputs = tc::aten::prepareOutputs(tc, entryPoint, atInputs);
+      tc::aten::run(*executor, atInputs, atOutputs);
+      return convertToPyObjects(atOutputs);
+    }
+  }
+  py::list uncheckedRun(
+      const py::tuple& inputs,
+      const py::tuple& outputs = py::tuple()) {
+    if (outputs.size() > 0) {
+      auto atOutputs = getATenTensors(outputs);
+      auto atInputs = getATenTensors(inputs);
+      tc::aten::uncheckedRun(*executor, atInputs, atOutputs);
+      return py::list(outputs);
+    } else {
+      auto atInputs = getATenTensors(inputs);
+      auto atOutputs = tc::aten::prepareOutputs(tc, entryPoint, atInputs);
+      tc::aten::uncheckedRun(*executor, atInputs, atOutputs);
+      return convertToPyObjects(atOutputs);
+    }
+  }
+  std::string tc;
+  std::string entryPoint;
+  std::unique_ptr<tc::CudaBackend::ExecutorType> executor;
+};
+
 class TunerConfig {
  public:
   TunerConfig(
@@ -345,6 +381,20 @@ PYBIND11_MODULE(tclib, m) {
   m.def(
       "set_dump_cuda", [](bool dump_cuda) { tc::FLAGS_dump_cuda = dump_cuda; });
 
+  py::class_<TcExecutor>(m, "TcExecutor", py::module_local())
+      .def("run", &TcExecutor::run)
+      .def("unchecked_run", &TcExecutor::uncheckedRun);
+  m.def(
+      "compile",
+      [](const std::string& tc,
+         const std::string& entryPoint,
+         const py::tuple& inputs,
+         const tc::CudaMappingOptions& options) {
+        auto execUPtr = tc::aten::compile<tc::CudaBackend>(
+            tc, entryPoint, getATenTensors(inputs), options);
+        return TcExecutor{tc, entryPoint, std::move(execUPtr)};
+      });
+
   py::class_<TunerConfig>(m, "TunerConfig", py::module_local())
       .def(
           py::init<uint32_t, uint32_t, uint32_t, std::string, bool, uint32_t>(),
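
From Python, the practical effect of the new TcExecutor binding is that output allocation can be skipped on steady-state calls: a non-empty outputs tuple takes the first branch above and writes into the given tensors, while an empty tuple goes through tc::aten::prepareOutputs. A small illustrative sketch (not part of the commit), reusing mm, mat1 and mat2 from the example file:

    executor = compile(mm, "matmul", (mat1, mat2), MappingOptions())
    # First call: empty outputs tuple, so sizes are inferred and tensors allocated.
    outputs = executor.run((mat1, mat2), ())
    # Steady state: pass the previously allocated outputs to avoid that overhead.
    outputs = executor.unchecked_run((mat1, mat2), tuple(outputs))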
