|
13 | 13 | * See the License for the specific language governing permissions and
|
14 | 14 | * limitations under the License.
|
15 | 15 | */
|
| 16 | +#include <cstdio> |
| 17 | +#include <cstdlib> |
| 18 | +#include <fstream> |
| 19 | +#include <iostream> |
16 | 20 | #include <sstream>
|
17 | 21 | #include <string>
|
18 | 22 | #include <vector>
|
@@ -60,30 +64,89 @@ void checkOrCreateContext() {
|
60 | 64 | }
|
61 | 65 | }
|
62 | 66 |
|
63 |
| -std::unique_ptr<CudaRTCFunction> CudaRTCFunction::Compile( |
64 |
| - const std::string& name, |
65 |
| - const std::string& source) { |
66 |
| - std::unique_ptr<CudaRTCFunction> res(new CudaRTCFunction()); |
67 |
| - res->specializedName = name; |
68 |
| - res->cleared_ = false; |
69 |
| - |
70 |
| - if (FLAGS_debug_tc_mapper) { |
71 |
| - LOG(INFO) << "NVRTC function source:\n" << source; |
72 |
| - } |
73 |
| - // Actually do the compiling. |
74 |
| - nvrtcProgram prog; |
75 |
| - TC_NVRTC_CHECK( |
76 |
| - nvrtcCreateProgram(&prog, source.c_str(), nullptr, 0, nullptr, nullptr)); |
77 |
| - |
78 |
| - // Get the architecture of the current device. |
79 |
| - int device, minor, major; |
| 67 | +namespace { |
| 68 | +static std::tuple<int, int, int> getCudaArchitecture() { |
| 69 | + int device, major, minor; |
80 | 70 | CUdevice deviceHandle;
|
81 | 71 | TC_CUDA_RUNTIMEAPI_ENFORCE(cudaGetDevice(&device));
|
82 | 72 | TC_CUDA_DRIVERAPI_ENFORCE(cuDeviceGet(&deviceHandle, device));
|
83 | 73 | TC_CUDA_DRIVERAPI_ENFORCE(cuDeviceGetAttribute(
|
84 | 74 | &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, deviceHandle));
|
85 | 75 | TC_CUDA_DRIVERAPI_ENFORCE(cuDeviceGetAttribute(
|
86 | 76 | &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, deviceHandle));
|
| 77 | + return std::tuple<int, int, int>(device, major, minor); |
| 78 | +} |
| 79 | + |
| 80 | +static std::string llvmCompile( |
| 81 | + const std::string& name, |
| 82 | + const std::string& source) { |
| 83 | + int device, major, minor; |
| 84 | + std::tie(device, major, minor) = getCudaArchitecture(); |
| 85 | + |
| 86 | + std::string pat("/tmp/cudaXXXXXX"); |
| 87 | + std::vector<char> ifn(pat.begin(), pat.end()); |
| 88 | + TC_CHECK_GE(mkstemp(ifn.data()), 0); // string.c_str is const char* |
| 89 | + std::string inputFileName(ifn.begin(), ifn.end()); |
| 90 | + // cstdio's std::remove to delete files |
| 91 | + tc::ScopeGuard sgi([&]() { std::remove(inputFileName.c_str()); }); |
| 92 | + { |
| 93 | + std::ofstream ostream(inputFileName, std::ios::binary); |
| 94 | + ostream << source; |
| 95 | + } |
| 96 | + |
| 97 | + std::string arch = "sm_" + std::to_string(major) + std::to_string(minor); |
| 98 | + std::string outputClangFile = inputFileName + "-clang.ll"; |
| 99 | + std::string outputLinkFile = inputFileName + "-link.ll"; |
| 100 | + std::string outputOptFile = inputFileName + "-opt.ll"; |
| 101 | + std::string outputPtxFile = inputFileName + ".s"; |
| 102 | + tc::ScopeGuard sgo([&]() { |
| 103 | + // cstdio's std::remove to delete files |
| 104 | + std::remove(outputClangFile.c_str()); |
| 105 | + std::remove(outputLinkFile.c_str()); |
| 106 | + std::remove(outputOptFile.c_str()); |
| 107 | + std::remove(outputPtxFile.c_str()); |
| 108 | + }); |
| 109 | + |
| 110 | + std::string cmdLlvmIr = std::string(TC_STRINGIFY(TC_LLVM_BIN_DIR)) + |
| 111 | + "/clang++ -x cuda " + inputFileName + " " + "--cuda-device-only " + |
| 112 | + "--cuda-gpu-arch=" + arch + " " + |
| 113 | + "--cuda-path=" + TC_STRINGIFY(TC_CUDA_TOOLKIT_ROOT_DIR) + " " + "-I" + |
| 114 | + TC_STRINGIFY(TC_CUDA_INCLUDE_DIR) + " " + "-I" + |
| 115 | + TC_STRINGIFY(TC_CUB_INCLUDE_DIR) + " " + tc::FLAGS_llvm_flags + |
| 116 | + " -DNVRTC_CUB=1 " + "-nocudalib -S -emit-llvm " + "-o " + |
| 117 | + outputClangFile; |
| 118 | + TC_CHECK_EQ(std::system(cmdLlvmIr.c_str()), 0) << cmdLlvmIr; |
| 119 | + |
| 120 | + std::string cmdLlvmLink = std::string(TC_STRINGIFY(TC_LLVM_BIN_DIR)) + |
| 121 | + "/llvm-link " + outputClangFile + " " + |
| 122 | + TC_STRINGIFY(TC_CUDA_TOOLKIT_ROOT_DIR) + |
| 123 | + "/nvvm/libdevice/libdevice.*.bc " + "-S -o " + outputLinkFile; |
| 124 | + TC_CHECK_EQ(std::system(cmdLlvmLink.c_str()), 0) << cmdLlvmLink; |
| 125 | + |
| 126 | + std::string cmdOpt = std::string(TC_STRINGIFY(TC_LLVM_BIN_DIR)) + "/opt " + |
| 127 | + "-internalize -internalize-public-api-list=" + name + " " + |
| 128 | + "-nvvm-reflect -O3 " + outputLinkFile + " -S -o " + outputOptFile; |
| 129 | + TC_CHECK_EQ(std::system(cmdOpt.c_str()), 0) << cmdOpt; |
| 130 | + |
| 131 | + std::string cmdPtx = std::string(TC_STRINGIFY(TC_LLVM_BIN_DIR)) + |
| 132 | + "/llc -mcpu=" + arch + " " + outputOptFile + " -o " + outputPtxFile; |
| 133 | + TC_CHECK_EQ(std::system(cmdPtx.c_str()), 0) << cmdPtx; |
| 134 | + |
| 135 | + std::ifstream stream(outputPtxFile); |
| 136 | + return std::string( |
| 137 | + (std::istreambuf_iterator<char>(stream)), |
| 138 | + std::istreambuf_iterator<char>()); |
| 139 | +} |
| 140 | + |
| 141 | +static std::string nvrtcCompile( |
| 142 | + const std::string& name, |
| 143 | + const std::string& source) { |
| 144 | + int device, major, minor; |
| 145 | + std::tie(device, major, minor) = getCudaArchitecture(); |
| 146 | + |
| 147 | + nvrtcProgram prog; |
| 148 | + TC_NVRTC_CHECK( |
| 149 | + nvrtcCreateProgram(&prog, source.c_str(), nullptr, 0, nullptr, nullptr)); |
87 | 150 |
|
88 | 151 | std::stringstream arch_param;
|
89 | 152 | arch_param << "--gpu-architecture=compute_" << major << minor;
|
@@ -125,14 +188,38 @@ std::unique_ptr<CudaRTCFunction> CudaRTCFunction::Compile(
|
125 | 188 | }
|
126 | 189 | size_t ptx_size;
|
127 | 190 | TC_NVRTC_CHECK(nvrtcGetPTXSize(prog, &ptx_size));
|
128 |
| - res->nvrtc_ptx = std::vector<char>(ptx_size); |
129 |
| - TC_NVRTC_CHECK(nvrtcGetPTX(prog, res->nvrtc_ptx.data())); |
| 191 | + std::vector<char> res(ptx_size); |
| 192 | + TC_NVRTC_CHECK(nvrtcGetPTX(prog, res.data())); |
130 | 193 | TC_NVRTC_CHECK(nvrtcDestroyProgram(&prog));
|
| 194 | + return std::string(res.begin(), res.end()); |
| 195 | +} |
| 196 | +} // namespace |
| 197 | + |
| 198 | +std::unique_ptr<CudaRTCFunction> CudaRTCFunction::Compile( |
| 199 | + const std::string& name, |
| 200 | + const std::string& source) { |
| 201 | + std::unique_ptr<CudaRTCFunction> res(new CudaRTCFunction()); |
| 202 | + res->specializedName = name; |
| 203 | + res->cleared_ = false; |
| 204 | + if (FLAGS_debug_tc_mapper) { |
| 205 | + LOG(INFO) << "NVRTC function source:\n" << source; |
| 206 | + } |
| 207 | + if (FLAGS_cuda_compiler == "nvrtc") { |
| 208 | + res->ptx = nvrtcCompile(name, source); |
| 209 | + } else if (FLAGS_cuda_compiler == "llvm") { |
| 210 | + res->ptx = llvmCompile(name, source); |
| 211 | + } else if (FLAGS_cuda_compiler == "nvcc") { |
| 212 | + CHECK(false) << "NYI"; |
| 213 | + // res->ptx = llvmCompile(name, source); |
| 214 | + } else { |
| 215 | + CHECK(false) << "Unknown CUDA compiler: " << FLAGS_cuda_compiler; |
| 216 | + } |
131 | 217 | if (FLAGS_dump_ptx) {
|
132 |
| - LOG(INFO) << "PTX:\n" << std::string(res->nvrtc_ptx.data()); |
| 218 | + LOG(INFO) << "PTX:\n" << res->ptx; |
133 | 219 | }
|
134 | 220 | return res;
|
135 | 221 | }
|
| 222 | + |
136 | 223 | namespace {
|
137 | 224 |
|
138 | 225 | template <typename T>
|
@@ -164,8 +251,11 @@ Duration CudaRTCFunction::Launch(
|
164 | 251 | // This call to cudaDeviceSynchronize implicitly creates a new context if
|
165 | 252 | // one is not bound to the current CPU.
|
166 | 253 | checkOrCreateContext();
|
167 |
| - TC_CUDA_DRIVERAPI_ENFORCE( |
168 |
| - cuModuleLoadDataEx(&module, nvrtc_ptx.data(), 0, 0, 0)); |
| 254 | + auto res = cuModuleLoadData(&module, ptx.c_str()); |
| 255 | + if (res != CUDA_SUCCESS) { |
| 256 | + LOG(ERROR) << "Invalid PTX: " << ptx; |
| 257 | + } |
| 258 | + TC_CUDA_DRIVERAPI_ENFORCE(res); |
169 | 259 | perGpuModule_.emplace(dev, module);
|
170 | 260 | TC_CUDA_DRIVERAPI_ENFORCE(
|
171 | 261 | cuModuleGetFunction(&function, module, specializedName.c_str()));
|
|
0 commit comments