Skip to content
This repository was archived by the owner on Apr 28, 2023. It is now read-only.

Support -mcpu flag and autodetect to trigger vectorization on proper vector lengths #588

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 6 additions & 16 deletions tc/core/cuda/cuda_rtc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
#include "tc/core/cuda/cuda_rtc.h"
#include "tc/core/flags.h"
#include "tc/core/scope_guard.h"
#include "tc/core/utils/system.h"

namespace tc {
std::mutex nvrtc_mutex;
Expand Down Expand Up @@ -65,17 +66,6 @@ void checkOrCreateContext() {
}

namespace {
// Runs an external command through std::system and checks its result.
//
// The command line is `cmd`, a space, then every entry of `args`, each
// followed by a single space (so the line always ends with a trailing space).
// Nothing is quoted or escaped: entries are passed to std::system verbatim.
//
// The value returned by std::system is compared against 0 with TC_CHECK_EQ,
// and the full command line is streamed into the check as its message.
static void checkedSystemCall(
    const std::string& cmd,
    const std::vector<std::string>& args) {
  std::stringstream command;
  command << cmd << ' ';
  for (auto it = args.begin(); it != args.end(); ++it) {
    command << *it << ' ';
  }
  TC_CHECK_EQ(std::system(command.str().c_str()), 0) << command.str();
}

static std::tuple<int, int, int> getCudaArchitecture() {
int device, major, minor;
CUdevice deviceHandle;
Expand Down Expand Up @@ -119,7 +109,7 @@ static std::string llvmCompile(
});

// Compile
checkedSystemCall(
utils::checkedSystemCall(
std::string(TC_STRINGIFY(TC_LLVM_BIN_DIR)) + "/clang++",
{"-x cuda " + inputFileName,
"--cuda-device-only",
Expand All @@ -134,7 +124,7 @@ static std::string llvmCompile(
"-o " + outputClangFile});

// Link libdevice before opt
checkedSystemCall(
utils::checkedSystemCall(
std::string(TC_STRINGIFY(TC_LLVM_BIN_DIR)) + "/llvm-link ",
{outputClangFile,
std::string(TC_STRINGIFY(TC_CUDA_TOOLKIT_ROOT_DIR)) +
Expand All @@ -143,7 +133,7 @@ static std::string llvmCompile(
"-o " + outputLinkFile});

// Opt
checkedSystemCall(
utils::checkedSystemCall(
std::string(TC_STRINGIFY(TC_LLVM_BIN_DIR)) + "/opt",
{"-internalize",
std::string("-internalize-public-api-list=") + name,
Expand All @@ -154,7 +144,7 @@ static std::string llvmCompile(
std::string("-o ") + outputOptFile});

// Ptx
checkedSystemCall(
utils::checkedSystemCall(
std::string(TC_STRINGIFY(TC_LLVM_BIN_DIR)) + "/llc",
{std::string("-mcpu=") + arch,
outputOptFile,
Expand Down Expand Up @@ -188,7 +178,7 @@ static std::string nvccCompile(
// cstdio's std::remove to delete files
tc::ScopeGuard sgo([&]() { std::remove(outputPtxFile.c_str()); });

checkedSystemCall(
utils::checkedSystemCall(
std::string(TC_STRINGIFY(TC_CUDA_TOOLKIT_ROOT_DIR)) + "/bin/nvcc",
{"-x cu",
inputFileName,
Expand Down
6 changes: 6 additions & 0 deletions tc/core/flags.cc
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,14 @@ DEFINE_string(
"compiler flags to set when nvcc is used");

// CPU codegen options
// Each DEFINE_* below is written as (flag name, default value, help text).
// mcpu: string flag, empty by default; its help text defers to LLVM's option
// of the same name for the accepted values.
DEFINE_string(mcpu, "", "see llvm's --mcpu");
// Boolean switches, all off by default, for printing the IR before and after
// optimization and for printing the asm.
DEFINE_bool(llvm_dump_before_opt, false, "Print IR before optimization");
DEFINE_bool(llvm_dump_after_opt, false, "Print IR after optimization");
DEFINE_bool(llvm_dump_asm, false, "Print asm");
// Options string used when dumping asm; defaults to "-filetype=asm".
// NOTE(review): presumably only consulted when llvm_dump_asm is set -- confirm
// at the use site.
DEFINE_string(
llvm_dump_asm_options,
"-filetype=asm",
"Options used when dumping asm");

DEFINE_uint32(
benchmark_warmup,
Expand Down
5 changes: 4 additions & 1 deletion tc/core/flags.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,12 @@ DECLARE_string(cuda_compiler);
DECLARE_string(llvm_flags);
DECLARE_string(nvcc_flags);

// llvm codegen
// CPU codegen options
// Declarations for the CPU codegen flags: mcpu and llvm_dump_asm_options are
// string flags, the three llvm_dump_* switches are booleans.
DECLARE_string(mcpu);
DECLARE_bool(llvm_dump_before_opt);
DECLARE_bool(llvm_dump_after_opt);
DECLARE_bool(llvm_dump_asm);
DECLARE_string(llvm_dump_asm_options);

// Used in benchmarking and autotuning
DECLARE_uint32(benchmark_warmup);
Expand Down
Loading