This repository was archived by the owner on Apr 28, 2023. It is now read-only.

Commit 052a3ef

nicolasvasilache authored and ftynse committed
Generate PTX with LLVM trunk
This commit uses trunk clang, llvm-link, opt and llc to emit PTX.
1 parent c83c36d commit 052a3ef
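
For orientation, the llvm path added in tc/core/cuda/cuda_rtc.cc (llvmCompile, below) drives a four-stage clang++ → llvm-link → opt → llc pipeline. The following is a minimal Python sketch of the equivalent commands; LLVM_BIN, CUDA_HOME, the sm_60 architecture, the my_kernel entry point and the kernel file names are placeholder assumptions, not the exact values the code computes at runtime.

    import subprocess

    # Placeholder paths/arch; llvmCompile derives these from the build config
    # (TC_LLVM_BIN_DIR, TC_CUDA_TOOLKIT_ROOT_DIR) and the current device.
    LLVM_BIN = "/usr/local/llvm/bin"
    CUDA_HOME = "/usr/local/cuda"
    ARCH = "sm_60"

    def run(cmd):
        # shell=True so the libdevice glob below is expanded by the shell
        subprocess.run(cmd, shell=True, check=True)

    # 1. CUDA C++ -> device-only LLVM IR
    run(f"{LLVM_BIN}/clang++ -x cuda kernel.cu --cuda-device-only "
        f"--cuda-gpu-arch={ARCH} --cuda-path={CUDA_HOME} "
        f"-std=c++11 -O3 -ffast-math -DNVRTC_CUB=1 -nocudalib "
        f"-S -emit-llvm -o kernel-clang.ll")
    # 2. Link against NVIDIA's libdevice bitcode
    run(f"{LLVM_BIN}/llvm-link kernel-clang.ll "
        f"{CUDA_HOME}/nvvm/libdevice/libdevice.*.bc -S -o kernel-link.ll")
    # 3. Internalize everything except the kernel entry point, then optimize
    run(f"{LLVM_BIN}/opt -internalize -internalize-public-api-list=my_kernel "
        f"-nvvm-reflect -O3 kernel-link.ll -S -o kernel-opt.ll")
    # 4. LLVM IR -> PTX
    run(f"{LLVM_BIN}/llc -mcpu={ARCH} kernel-opt.ll -o kernel.s")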

10 files changed, +167 −26 lines changed


CMakeLists.txt

Lines changed: 2 additions & 0 deletions
@@ -258,6 +258,8 @@ include(cmake/GetGitRevisionDescription.cmake)
 ################################################################################
 # Variables for tc_config.h.in
 set(TC_DIR ${TC_DIR})
+execute_process(COMMAND ${CLANG_PREFIX}/bin/llvm-config --bindir OUTPUT_VARIABLE LLVM_BIN_DIR OUTPUT_STRIP_TRAILING_WHITESPACE)
+set(TC_LLVM_BIN_DIR ${LLVM_BIN_DIR})
 if (WITH_CUDA)
   # CUDA-specific variables for tc_config.h.in
   set(TC_WITH_CUDA 1)

python/tests/test_tc.py

Lines changed: 17 additions & 1 deletion
@@ -59,7 +59,7 @@ def test_mapping_options(self):
             .outerScheduleFusionStrategy("Preserve3Coincident"))

     #
-    # Simple TC example with explicit 'naive' compilation
+    # Simple TC example with explicit 'naive' compilation with nvrtc (default)
     #
     def test_tc(self):
         A, B = torch.randn(100, device='cuda'), torch.randn(100, device='cuda')
@@ -72,6 +72,22 @@ def test_tc(self):
         C = add(A, B)
         tc.assert_almost_equal(C, torch.add(A, B), A, B)

+    #
+    # Simple TC example with explicit 'naive' compilation with llvm
+    #
+    def test_tc_llvm(self):
+        A, B = torch.randn(100, device='cuda'), torch.randn(100, device='cuda')
+        tc.cuda_compiler('llvm')
+        add = tc.compile(
+            "def add(float(N) A, float(N) B) -> (C) { C(i) = A(i) + B(i) }",
+            "add",
+            'naive',
+            A, B,
+        )
+        tc.cuda_compiler('nvrtc')
+        C = add(A, B)
+        tc.assert_almost_equal(C, torch.add(A, B), A, B)
+
     #
     # Simple TC example without fallback but with tuning starting from
     # MappingOptions('naive')
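
The new test flips the compiler back to 'nvrtc' by hand because cuda_compiler mutates a global flag. A small hypothetical helper (not part of the tc API) could scope the switch instead, for example:

    import contextlib
    import tensor_comprehensions as tc

    @contextlib.contextmanager
    def cuda_compiler_scope(name, default='nvrtc'):
        # Hypothetical wrapper: there is no getter for the flag, so the
        # previous value must be supplied (or assumed to be the default).
        tc.cuda_compiler(name)
        try:
            yield
        finally:
            tc.cuda_compiler(default)

    # usage:
    # with cuda_compiler_scope('llvm'):
    #     add = tc.compile(tc_str, "add", 'naive', A, B)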

tc/core/cuda/cuda_libraries.h

Lines changed: 3 additions & 2 deletions
@@ -43,8 +43,9 @@ constexpr auto defines = R"C(

 constexpr auto warpSyncFunctions = R"C(
 // Before CUDA 9, syncwarp is a noop since warps are always synchronized.
-#if __CUDACC_VER_MAJOR__ < 9
-__device__ void __syncwarp(unsigned mask = 0xFFFFFFFF) {}
+#if (!defined(__clang__) && __CUDACC_VER_MAJOR__ < 9) || \
+    ( defined(__clang__) && CUDA_VERSION < 9000)
+inline __device__ void __syncwarp(unsigned mask = 0xFFFFFFFF) {}
 #endif
 )C";

tc/core/cuda/cuda_rtc.cc

Lines changed: 112 additions & 22 deletions
@@ -13,6 +13,10 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+#include <cstdio>
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
 #include <sstream>
 #include <string>
 #include <vector>
@@ -60,30 +64,89 @@ void checkOrCreateContext() {
   }
 }

-std::unique_ptr<CudaRTCFunction> CudaRTCFunction::Compile(
-    const std::string& name,
-    const std::string& source) {
-  std::unique_ptr<CudaRTCFunction> res(new CudaRTCFunction());
-  res->specializedName = name;
-  res->cleared_ = false;
-
-  if (FLAGS_debug_tc_mapper) {
-    LOG(INFO) << "NVRTC function source:\n" << source;
-  }
-  // Actually do the compiling.
-  nvrtcProgram prog;
-  TC_NVRTC_CHECK(
-      nvrtcCreateProgram(&prog, source.c_str(), nullptr, 0, nullptr, nullptr));
-
-  // Get the architecture of the current device.
-  int device, minor, major;
+namespace {
+static std::tuple<int, int, int> getCudaArchitecture() {
+  int device, major, minor;
   CUdevice deviceHandle;
   TC_CUDA_RUNTIMEAPI_ENFORCE(cudaGetDevice(&device));
   TC_CUDA_DRIVERAPI_ENFORCE(cuDeviceGet(&deviceHandle, device));
   TC_CUDA_DRIVERAPI_ENFORCE(cuDeviceGetAttribute(
       &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, deviceHandle));
   TC_CUDA_DRIVERAPI_ENFORCE(cuDeviceGetAttribute(
       &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, deviceHandle));
+  return std::tuple<int, int, int>(device, major, minor);
+}
+
+static std::string llvmCompile(
+    const std::string& name,
+    const std::string& source) {
+  int device, major, minor;
+  std::tie(device, major, minor) = getCudaArchitecture();
+
+  std::string pat("/tmp/cudaXXXXXX");
+  std::vector<char> ifn(pat.begin(), pat.end());
+  TC_CHECK_GE(mkstemp(ifn.data()), 0); // string.c_str is const char*
+  std::string inputFileName(ifn.begin(), ifn.end());
+  // cstdio's std::remove to delete files
+  tc::ScopeGuard sgi([&]() { std::remove(inputFileName.c_str()); });
+  {
+    std::ofstream ostream(inputFileName, std::ios::binary);
+    ostream << source;
+  }
+
+  std::string arch = "sm_" + std::to_string(major) + std::to_string(minor);
+  std::string outputClangFile = inputFileName + "-clang.ll";
+  std::string outputLinkFile = inputFileName + "-link.ll";
+  std::string outputOptFile = inputFileName + "-opt.ll";
+  std::string outputPtxFile = inputFileName + ".s";
+  tc::ScopeGuard sgo([&]() {
+    // cstdio's std::remove to delete files
+    std::remove(outputClangFile.c_str());
+    std::remove(outputLinkFile.c_str());
+    std::remove(outputOptFile.c_str());
+    std::remove(outputPtxFile.c_str());
+  });
+
+  std::string cmdLlvmIr = std::string(TC_STRINGIFY(TC_LLVM_BIN_DIR)) +
+      "/clang++ -x cuda " + inputFileName + " " + "--cuda-device-only " +
+      "--cuda-gpu-arch=" + arch + " " +
+      "--cuda-path=" + TC_STRINGIFY(TC_CUDA_TOOLKIT_ROOT_DIR) + " " + "-I" +
+      TC_STRINGIFY(TC_CUDA_INCLUDE_DIR) + " " + "-I" +
+      TC_STRINGIFY(TC_CUB_INCLUDE_DIR) + " " + tc::FLAGS_llvm_flags +
+      " -DNVRTC_CUB=1 " + "-nocudalib -S -emit-llvm " + "-o " +
+      outputClangFile;
+  TC_CHECK_EQ(std::system(cmdLlvmIr.c_str()), 0) << cmdLlvmIr;
+
+  std::string cmdLlvmLink = std::string(TC_STRINGIFY(TC_LLVM_BIN_DIR)) +
+      "/llvm-link " + outputClangFile + " " +
+      TC_STRINGIFY(TC_CUDA_TOOLKIT_ROOT_DIR) +
+      "/nvvm/libdevice/libdevice.*.bc " + "-S -o " + outputLinkFile;
+  TC_CHECK_EQ(std::system(cmdLlvmLink.c_str()), 0) << cmdLlvmLink;
+
+  std::string cmdOpt = std::string(TC_STRINGIFY(TC_LLVM_BIN_DIR)) + "/opt " +
+      "-internalize -internalize-public-api-list=" + name + " " +
+      "-nvvm-reflect -O3 " + outputLinkFile + " -S -o " + outputOptFile;
+  TC_CHECK_EQ(std::system(cmdOpt.c_str()), 0) << cmdOpt;
+
+  std::string cmdPtx = std::string(TC_STRINGIFY(TC_LLVM_BIN_DIR)) +
+      "/llc -mcpu=" + arch + " " + outputOptFile + " -o " + outputPtxFile;
+  TC_CHECK_EQ(std::system(cmdPtx.c_str()), 0) << cmdPtx;
+
+  std::ifstream stream(outputPtxFile);
+  return std::string(
+      (std::istreambuf_iterator<char>(stream)),
+      std::istreambuf_iterator<char>());
+}
+
+static std::string nvrtcCompile(
+    const std::string& name,
+    const std::string& source) {
+  int device, major, minor;
+  std::tie(device, major, minor) = getCudaArchitecture();
+
+  nvrtcProgram prog;
+  TC_NVRTC_CHECK(
+      nvrtcCreateProgram(&prog, source.c_str(), nullptr, 0, nullptr, nullptr));

   std::stringstream arch_param;
   arch_param << "--gpu-architecture=compute_" << major << minor;
@@ -125,14 +188,38 @@ std::unique_ptr<CudaRTCFunction> CudaRTCFunction::Compile(
   }
   size_t ptx_size;
   TC_NVRTC_CHECK(nvrtcGetPTXSize(prog, &ptx_size));
-  res->nvrtc_ptx = std::vector<char>(ptx_size);
-  TC_NVRTC_CHECK(nvrtcGetPTX(prog, res->nvrtc_ptx.data()));
+  std::vector<char> res(ptx_size);
+  TC_NVRTC_CHECK(nvrtcGetPTX(prog, res.data()));
   TC_NVRTC_CHECK(nvrtcDestroyProgram(&prog));
+  return std::string(res.begin(), res.end());
+}
+} // namespace
+
+std::unique_ptr<CudaRTCFunction> CudaRTCFunction::Compile(
+    const std::string& name,
+    const std::string& source) {
+  std::unique_ptr<CudaRTCFunction> res(new CudaRTCFunction());
+  res->specializedName = name;
+  res->cleared_ = false;
+  if (FLAGS_debug_tc_mapper) {
+    LOG(INFO) << "NVRTC function source:\n" << source;
+  }
+  if (FLAGS_cuda_compiler == "nvrtc") {
+    res->ptx = nvrtcCompile(name, source);
+  } else if (FLAGS_cuda_compiler == "llvm") {
+    res->ptx = llvmCompile(name, source);
+  } else if (FLAGS_cuda_compiler == "nvcc") {
+    CHECK(false) << "NYI";
+    // res->ptx = llvmCompile(name, source);
+  } else {
+    CHECK(false) << "Unknown CUDA compiler: " << FLAGS_cuda_compiler;
+  }
   if (FLAGS_dump_ptx) {
-    LOG(INFO) << "PTX:\n" << std::string(res->nvrtc_ptx.data());
+    LOG(INFO) << "PTX:\n" << res->ptx;
   }
   return res;
 }
+
 namespace {

 template <typename T>
@@ -164,8 +251,11 @@ Duration CudaRTCFunction::Launch(
   // This call to cudaDeviceSynchronize implicitly creates a new context if
   // one is not bound to the current CPU.
   checkOrCreateContext();
-  TC_CUDA_DRIVERAPI_ENFORCE(
-      cuModuleLoadDataEx(&module, nvrtc_ptx.data(), 0, 0, 0));
+  auto res = cuModuleLoadData(&module, ptx.c_str());
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << "Invalid PTX: " << ptx;
+  }
+  TC_CUDA_DRIVERAPI_ENFORCE(res);
   perGpuModule_.emplace(dev, module);
   TC_CUDA_DRIVERAPI_ENFORCE(
       cuModuleGetFunction(&function, module, specializedName.c_str()));
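
To inspect the PTX either backend emits from Python, the existing dump_ptx flag can be combined with the new compiler switch; a short usage sketch (the TC definition mirrors the add example from the tests):

    import torch
    import tensor_comprehensions as tc

    tc.dump_ptx(True)         # log the generated PTX
    tc.cuda_compiler('llvm')  # or 'nvrtc' (the default)
    A, B = torch.randn(100, device='cuda'), torch.randn(100, device='cuda')
    add = tc.compile(
        "def add(float(N) A, float(N) B) -> (C) { C(i) = A(i) + B(i) }",
        "add",
        'naive',
        A, B,
    )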

tc/core/cuda/cuda_rtc.h

Lines changed: 1 addition & 1 deletion
@@ -65,7 +65,7 @@ class CudaRTCFunction {
   mutable std::unordered_map<size_t, CUmodule> perGpuModule_;
   mutable std::unordered_map<size_t, CUfunction> perGpuKernel_;
   std::string specializedName;
-  std::vector<char> nvrtc_ptx;
+  std::string ptx;
   bool cleared_;
 };

tc/core/flags.cc

Lines changed: 10 additions & 0 deletions
@@ -38,6 +38,16 @@ DEFINE_bool(
 DEFINE_bool(dump_cuda, false, "Print the generated source");
 DEFINE_bool(dump_ptx, false, "Dump the generated PTX");

+// PTX generation
+DEFINE_string(
+    cuda_compiler,
+    "nvrtc",
+    "which compiler to use to emit ptx: nvrtc, llvm, nvcc (default [nvrtc])");
+DEFINE_string(
+    llvm_flags,
+    "-std=c++11 -O3 -ffast-math",
+    "compiler flags to set when llvm is used");
+
 // CPU codegen options
 DEFINE_bool(llvm_dump_before_opt, false, "Print IR before optimization");
 DEFINE_bool(llvm_dump_after_opt, false, "Print IR after optimization");

tc/core/flags.h

Lines changed: 4 additions & 0 deletions
@@ -31,6 +31,10 @@ DECLARE_bool(debug_tuner);
 DECLARE_bool(dump_cuda);
 DECLARE_bool(dump_ptx);

+// ptx generation
+DECLARE_string(cuda_compiler);
+DECLARE_string(llvm_flags);
+
 // llvm codegen
 DECLARE_bool(llvm_dump_before_opt);
 DECLARE_bool(llvm_dump_after_opt);

tc/tc_config.h.in

Lines changed: 1 addition & 0 deletions
@@ -24,4 +24,5 @@
 #define TC_CUDA_TOOLKIT_ROOT_DIR @TC_CUDA_TOOLKIT_ROOT_DIR@
 #define TC_CUDA_INCLUDE_DIR @TC_CUDA_INCLUDE_DIR@
 #define TC_CUB_INCLUDE_DIR @TC_CUB_INCLUDE_DIR@
+#define TC_LLVM_BIN_DIR @TC_LLVM_BIN_DIR@
 // clang-format on

tensor_comprehensions/__init__.py

Lines changed: 4 additions & 0 deletions
@@ -29,6 +29,8 @@
 from tensor_comprehensions.tclib import debug_tuner
 from tensor_comprehensions.tclib import dump_cuda
 from tensor_comprehensions.tclib import dump_ptx
+from tensor_comprehensions.tclib import cuda_compiler
+from tensor_comprehensions.tclib import llvm_flags

 from tensor_comprehensions.tclib import CompilationCache
 from tensor_comprehensions.tclib import MappingOptions
@@ -606,6 +608,8 @@ def make_autograd(forward_fun: Callable[[Iterable[torch.Tensor]], Iterable[torch
     'debug_tuner',
     'dump_cuda',
     'dump_ptx',
+    'cuda_compiler',
+    'llvm_flags',
     # Functions exposed by the tclib
     'compile',
     'autotune',

tensor_comprehensions/pybinds/tclib.cc

Lines changed: 13 additions & 0 deletions
@@ -439,6 +439,19 @@ PYBIND11_MODULE(tclib, m) {
       });
   m.def("dump_cuda", [](bool dump_cuda) { tc::FLAGS_dump_cuda = dump_cuda; });
   m.def("dump_ptx", [](bool dump_ptx) { tc::FLAGS_dump_ptx = dump_ptx; });
+  m.def(
+      "cuda_compiler",
+      [](const std::string& cuda_compiler) {
+        tc::FLAGS_cuda_compiler = cuda_compiler;
+      },
+      gflags::DescribeOneFlag(
+          gflags::GetCommandLineFlagInfoOrDie("cuda_compiler"))
+          .c_str());
+  m.def(
+      "llvm_flags",
+      [](const std::string& llvm_flags) { tc::FLAGS_llvm_flags = llvm_flags; },
+      gflags::DescribeOneFlag(gflags::GetCommandLineFlagInfoOrDie("llvm_flags"))
+          .c_str());

   // Access the names of the defs in a TC string
   m.def("parse_defs", [](const std::string& tc) {
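
These bindings simply assign the corresponding tc::FLAGS_* globals, so from Python the equivalent of passing --cuda_compiler=llvm --llvm_flags=... to a gflags-based binary is:

    import tensor_comprehensions as tc

    tc.cuda_compiler('llvm')                     # sets tc::FLAGS_cuda_compiler
    tc.llvm_flags('-std=c++11 -O3 -ffast-math')  # sets tc::FLAGS_llvm_flags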
