This repository was archived by the owner on Apr 28, 2023. It is now read-only.

Commit 3a36e60

Improve tclib::MappingOptions bindings
This commit adds the previously missing `usePrivateMemory` binding. Additionally, each setter on the Python side now returns the `MappingOptions` instance, so the methods become nicely chainable.
1 parent 58f2d0b commit 3a36e60
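
With the chainable setters, a set of mapping options can be built in one fluent expression. A minimal sketch of the usage this enables, based on the bindings touched in this commit; the particular option values and tensor sizes here are illustrative only, not recommendations:

```python
from tensor_comprehensions.tclib import MappingOptions

# Start from the naive options and refine them in a single chained expression;
# every setter below now returns the MappingOptions instance it was called on.
options = (MappingOptions('naive')
           .useSharedMemory(True)
           .usePrivateMemory(True)
           .scheduleFusionStrategy("Max")
           .tile((4, 8))
           .mapToThreads((32, 8))
           .mapToBlocks((64, 64))
           .unroll(16))

# The resulting object can be passed wherever MappingOptions is expected,
# e.g. compile(mm, "matmul", (mat1, mat2), options).
```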

2 files changed: +26 -7 lines changed

python/examples/tc_pybind_example.py

Lines changed: 5 additions & 5 deletions
@@ -86,7 +86,7 @@ def matmul_grad(float(M,N) A, float(N,K) B, float(M,K) d_O) -> (d_A, d_B) {
 ################################################################################
 from tensor_comprehensions.tclib import compile
 
-executor = compile(mm, "matmul", (mat1, mat2), MappingOptions())
+executor = compile(mm, "matmul", (mat1, mat2), MappingOptions('naive'))
 outputs = executor.run((mat1, mat2), ())
 outputs = executor.unchecked_run((mat1, mat2), tuple(outputs))
 time_tc(100,
@@ -108,7 +108,7 @@ def matmul_grad(float(M,N) A, float(N,K) B, float(M,K) d_O) -> (d_A, d_B) {
 compilation_cache = CompilationCache(mm)
 # Compilation returns an allocated tuple of outputs with the proper shapes.
 # Allocation overhead is negligible compared to compilation overhead.
-compilation_cache.compile("matmul", (mat1, mat2), MappingOptions())
+compilation_cache.compile("matmul", (mat1, mat2), MappingOptions('naive'))
 # Run once without timing
 compilation_cache.unchecked_run("matmul", (mat1, mat2), ())
 # unchecked_run on tensors
@@ -136,7 +136,7 @@ def matmul_grad(float(M,N) A, float(N,K) B, float(M,K) d_O) -> (d_A, d_B) {
 top1 = tuner.tune(
     "matmul",
     (mat1, mat2),
-    MappingOptions(),
+    MappingOptions('naive'),
     TunerConfig(threads = 8, pop_size = 25, generations = 3, devices = "0"))
 cache = MappingOptionsCache(unique_filename)
 top10 = cache.load(mm, "matmul", (mat1, mat2), 10)
@@ -213,7 +213,7 @@ def compileOrTune(self, name = "", force_reinforcement_tuning = False, inputs =
     "########################################################")
 
 if len(base_options_list) == 0:
-    mapping_options = MappingOptions()
+    mapping_options = MappingOptions('naive')
 else:
     mapping_options = base_options_list[0]
 
@@ -417,7 +417,7 @@ def compileOrTune(self, name = "", force_reinforcement_tuning = False, inputs =
     "########################################################")
 
 if len(base_options_list) == 0:
-    mapping_options = MappingOptions()
+    mapping_options = MappingOptions('naive')
 else:
     mapping_options = base_options_list[0]
 
tensor_comprehensions/pybinds/tclib.cc

Lines changed: 21 additions & 2 deletions
@@ -514,7 +514,11 @@ PYBIND11_MODULE(tclib, m) {
       "MappingOptions for a Tensor Comprehensions (TC)",
       py::module_local())
       .def(
-          py::init([]() {
+          py::init([](const std::string& optionsName) {
+            TC_CHECK_EQ(optionsName, "naive")
+                << "Naive options are the only constructible user-facing "
+                << "options. We recommended using the tuner to get better "
+                << "options or, alternatively, retrieving some from a cache.";
             return tc::CudaMappingOptions::makeNaiveMappingOptions();
           }),
           "Initialize naive CudaMappingOption")
@@ -543,6 +547,10 @@ PYBIND11_MODULE(tclib, m) {
           &tc::CudaMappingOptions::useSharedMemory,
           "Create block-local copies of data in shared memory when this can "
           "leverage data reuse or global memory access coalescing")
+      .def(
+          "usePrivateMemory",
+          &tc::CudaMappingOptions::usePrivateMemory,
+          "Create thread-local copies of data in private memory")
       .def(
           "unrollCopyShared",
           &tc::CudaMappingOptions::unrollCopyShared,
@@ -556,13 +564,15 @@ PYBIND11_MODULE(tclib, m) {
           "scheduleFusionStrategy",
           [](tc::CudaMappingOptions& instance, const std::string& type) {
             instance.scheduleFusionStrategy(type);
+            return instance;
           },
           "Set up outerScheduleFusionStrategy and intraTileFusionStrategy "
           "to the given value")
       .def(
           "outerScheduleFusionStrategy",
           [](tc::CudaMappingOptions& instance, const std::string& type) {
             instance.outerScheduleFusionStrategy(type);
+            return instance;
           },
           "Require TC to try and execute different TC expressions interleaved "
           "(Max), separately (Min)\n"
@@ -574,6 +584,7 @@ PYBIND11_MODULE(tclib, m) {
           "intraTileScheduleFusionStrategy",
           [](tc::CudaMappingOptions& instance, const std::string& type) {
             instance.intraTileScheduleFusionStrategy(type);
+            return instance;
           },
           "Require TC to try and execute different TC expressions interleaved "
           "(Max), separately (Min)\n"
@@ -584,7 +595,10 @@ PYBIND11_MODULE(tclib, m) {
           "tile",
           // pybind11 has implicit conversion from tuple -> vector
           [](tc::CudaMappingOptions& instance,
-             std::vector<uint64_t>& tileSizes) { instance.tile(tileSizes); },
+             std::vector<uint64_t>& tileSizes) {
+            instance.tile(tileSizes);
+            return instance;
+          },
           "Perform loop tiling on the generated code with the given sizes. "
           "Independent of mapping to a\n"
           "grid of thread blocks")
@@ -593,6 +607,7 @@ PYBIND11_MODULE(tclib, m) {
           [](tc::CudaMappingOptions& instance,
              std::vector<uint64_t>& threadSizes) {
             instance.mapToThreads(threadSizes);
+            return instance;
           },
           "The configuration of CUDA block, i.e. the number of CUDA threads "
           "in each block along three\n"
@@ -604,6 +619,7 @@ PYBIND11_MODULE(tclib, m) {
           [](tc::CudaMappingOptions& instance,
              std::vector<uint64_t>& blockSizes) {
             instance.mapToBlocks(blockSizes);
+            return instance;
           },
           "The configuration of CUDA grid, i.e. the number of CUDA blocks "
           "along three dimensions. Must be\n"
@@ -613,13 +629,15 @@ PYBIND11_MODULE(tclib, m) {
           "matchLibraryCalls",
           [](tc::CudaMappingOptions& instance, bool match) {
             instance.matchLibraryCalls(match);
+            return instance;
           },
           "Replace computation patterns with calls to highly optimized "
           "libraries (such as CUB, CUTLASS) when possible")
       .def(
           "fixParametersBeforeScheduling",
           [](tc::CudaMappingOptions& instance, bool fix) {
             instance.fixParametersBeforeScheduling(fix);
+            return instance;
           },
           "Perform automatic loop scheduling taking into account specific "
           "tensor sizes.\n"
@@ -631,6 +649,7 @@ PYBIND11_MODULE(tclib, m) {
           "unroll",
           [](tc::CudaMappingOptions& instance, uint64_t factor) {
             instance.unroll(factor);
+            return instance;
           },
           "Perform loop unrolling on the generated code and produce at "
           "most the given number of statements");
