Generate PTX with NVCC

nicolasvasilache · ftynse · commit ecd85a193b10 · 2018-07-24T18:48:50.000+02:00
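This commit adds `nvcc` as a supported value of the `cuda_compiler` flag: the generated CUDA source is written to a temporary file and compiled to PTX by invoking the toolkit's `nvcc`, with extra arguments taken from the new `nvcc_flags` flag (also exposed to Python).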
diff --git a/python/tests/test_tc.py b/python/tests/test_tc.py
@@ -84,6 +84,24 @@ def test_tc_llvm(self):
             'naive',
             A, B,
         )
+        # Reset the cuda compiler back to nvrtc
+        tc.cuda_compiler('nvrtc')
+        C = add(A, B)
+        tc.assert_almost_equal(C, torch.add(A, B), A, B)
+
+    #
+    # Simple TC example with explicit 'naive' compilation with nvcc
+    #
+    def test_tc_nvcc(self):
+        A, B = torch.randn(100, device='cuda'), torch.randn(100, device='cuda')
+        tc.cuda_compiler('nvcc')  # select nvcc for the tc.compile call below
+        add = tc.compile(
+            "def add(float(N) A, float(N) B) -> (C) { C(i) = A(i) + B(i) }",
+            "add",
+            'naive',
+            A, B,
+        )
+        # Reset the cuda compiler back to nvrtc (NOTE(review): presumably so the choice does not leak into other tests; confirm)
         tc.cuda_compiler('nvrtc')
         C = add(A, B)
         tc.assert_almost_equal(C, torch.add(A, B), A, B)
diff --git a/tc/core/cuda/cuda_rtc.cc b/tc/core/cuda/cuda_rtc.cc
@@ -138,6 +138,41 @@ static std::string llvmCompile(
       std::istreambuf_iterator<char>());
 }
 
+// Compiles CUDA `source` to PTX by running nvcc on a temporary file and returns
+// the PTX text; tc::ScopeGuard removes both temporaries. `name` is unused here.
+static std::string nvccCompile(
+    const std::string& name,
+    const std::string& source) {
+  int device, major, minor;
+  std::tie(device, major, minor) = getCudaArchitecture();
+
+  // mkstemp needs a writable, NUL-terminated template; &string[0] is both.
+  std::string inputFileName("/tmp/cudaXXXXXX");
+  const int fd = mkstemp(&inputFileName[0]);
+  TC_CHECK_GE(fd, 0) << inputFileName;
+  tc::ScopeGuard sgi([&]() { std::remove(inputFileName.c_str()); });
+  // Write through mkstemp's descriptor so that fclose releases it (no fd leak).
+  std::FILE* input = fdopen(fd, "wb");
+  TC_CHECK_EQ(input != nullptr, true) << inputFileName;
+  std::fwrite(source.data(), 1, source.size(), input);
+  std::fclose(input);
+
+  std::string arch = "sm_" + std::to_string(major) + std::to_string(minor);
+  std::string outputPtxFile = inputFileName + ".ptx";
+  tc::ScopeGuard sgo([&]() { std::remove(outputPtxFile.c_str()); });
+
+  std::string cmdPtx = std::string(TC_STRINGIFY(TC_CUDA_TOOLKIT_ROOT_DIR)) +
+      "/bin/nvcc -x cu " + inputFileName + " --gpu-architecture=" + arch + " " +
+      "--ptx " + "-I" + TC_STRINGIFY(TC_CUDA_INCLUDE_DIR) + " " + "-I" +
+      TC_STRINGIFY(TC_CUB_INCLUDE_DIR) + " " + tc::FLAGS_nvcc_flags + " -o " +
+      outputPtxFile;
+  TC_CHECK_EQ(std::system(cmdPtx.c_str()), 0) << cmdPtx;
+  std::ifstream stream(outputPtxFile);
+  return std::string(
+      (std::istreambuf_iterator<char>(stream)),
+      std::istreambuf_iterator<char>());
+}
+
 static std::string nvrtcCompile(
     const std::string& name,
     const std::string& source) {
@@ -209,8 +244,7 @@ std::unique_ptr<CudaRTCFunction> CudaRTCFunction::Compile(
   } else if (FLAGS_cuda_compiler == "llvm") {
     res->ptx = llvmCompile(name, source);
   } else if (FLAGS_cuda_compiler == "nvcc") {
-    CHECK(false) << "NYI";
-    // res->ptx = llvmCompile(name, source);
+    res->ptx = nvccCompile(name, source);
   } else {
     CHECK(false) << "Unknown CUDA compiler: " << FLAGS_cuda_compiler;
   }
diff --git a/tc/core/flags.cc b/tc/core/flags.cc
@@ -47,6 +47,10 @@ DEFINE_string(
     llvm_flags,
     "-std=c++11 -O3 -ffast-math",
     "compiler flags to set when llvm is used");
+DEFINE_string(
+    nvcc_flags, // appended verbatim to the nvcc command line built in nvccCompile
+    "-std=c++11 -ptx -DNVRTC_CUB=1 --use_fast_math", // NOTE(review): -ptx repeats the --ptx nvccCompile already passes
+    "compiler flags to set when nvcc is used");
 
 // CPU codegen options
 DEFINE_bool(llvm_dump_before_opt, false, "Print IR before optimization");
diff --git a/tc/core/flags.h b/tc/core/flags.h
@@ -34,6 +34,7 @@ DECLARE_bool(dump_ptx);
 // ptx generation
 DECLARE_string(cuda_compiler);
 DECLARE_string(llvm_flags);
+DECLARE_string(nvcc_flags);
 
 // llvm codegen
 DECLARE_bool(llvm_dump_before_opt);
diff --git a/tensor_comprehensions/__init__.py b/tensor_comprehensions/__init__.py
@@ -31,6 +31,7 @@
 from tensor_comprehensions.tclib import dump_ptx
 from tensor_comprehensions.tclib import cuda_compiler
 from tensor_comprehensions.tclib import llvm_flags
+from tensor_comprehensions.tclib import nvcc_flags
 
 from tensor_comprehensions.tclib import CompilationCache
 from tensor_comprehensions.tclib import MappingOptions
@@ -610,6 +611,7 @@ def make_autograd(forward_fun: Callable[[Iterable[torch.Tensor]], Iterable[torch
     'dump_ptx',
     'cuda_compiler',
     'llvm_flags',
+    'nvcc_flags',
     # Functions exposed by the tclib
     'compile',
     'autotune',
diff --git a/tensor_comprehensions/pybinds/tclib.cc b/tensor_comprehensions/pybinds/tclib.cc
@@ -452,6 +452,11 @@ PYBIND11_MODULE(tclib, m) {
       [](const std::string& llvm_flags) { tc::FLAGS_llvm_flags = llvm_flags; },
       gflags::DescribeOneFlag(gflags::GetCommandLineFlagInfoOrDie("llvm_flags"))
           .c_str());
+  m.def(
+      "nvcc_flags",
+      [](const std::string& nvcc_flags) { tc::FLAGS_nvcc_flags = nvcc_flags; },
+      gflags::DescribeOneFlag(gflags::GetCommandLineFlagInfoOrDie("nvcc_flags"))
+          .c_str());
 
   // Access the names of the defs in a TC string
   m.def("parse_defs", [](const std::string& tc) {