Skip to content
This repository was archived by the owner on Apr 28, 2023. It is now read-only.

Commit cacdc59

Browse files
author
Sven Verdoolaege
committed
add wrapper around __ldg for those poor souls with a very old CUDA card
__ldg is apparently only supported since 3.5. Add a wrapper to ignore the __ldg call on older architectures. The wrapper is put inside the __tc namespace in analogy with __tc::CubReduceAlongX, even though names that start with a double underscore are supposed to be reserved. The wrapper was inspired by https://stackoverflow.com/a/27302007
1 parent 639ee29 commit cacdc59

File tree

4 files changed

+26
-3
lines changed

4 files changed

+26
-3
lines changed

tc/core/libraries.h

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,25 @@ template<typename T> inline __device__ T floord(T n, T d) {
156156
} // namespace cpp
157157

158158
namespace cuda {
159+
160+
// Wrapper around __ldg to avoid compilation errors
161+
// on architectures that do not support it.
162+
constexpr auto ldg = R"CUDA(
163+
164+
namespace __tc {
165+
template<typename T>
166+
__device__ __forceinline__ T ldg(const T* ptr) {
167+
#if __CUDA_ARCH__ >= 350
168+
return __ldg(ptr);
169+
#else
170+
return *ptr;
171+
#endif
172+
}
173+
} // namespace __tc
174+
)CUDA";
175+
176+
const static std::string kLdg = "__tc::ldg";
177+
159178
constexpr auto common = R"CUDA(
160179
161180
namespace __tc {

tc/core/polyhedral/cuda/codegen.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -381,7 +381,7 @@ struct LdgWrapper {
381381
LdgWrapper(const CodegenStatementContext& context, isl::id id)
382382
: readOnly_(context.readOnlySet.count(id) > 0), out_(context.ss) {
383383
if (readOnly_) {
384-
out_ << "__ldg(&";
384+
out_ << tc::code::cuda::kLdg << "(&";
385385
}
386386
}
387387

tc/core/polyhedral/cuda/mapped_scop.cc

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -919,6 +919,9 @@ std::tuple<std::string, tc::Grid, tc::Block> MappedScop::codegen(
919919
code << code::cpp::boundsAsTemplate << code::c::types << code::c::defines;
920920
code << code::c::warpSyncFunctions;
921921
code << std::endl;
922+
if (useReadOnlyCache) {
923+
code << code::cuda::ldg;
924+
}
922925
if (mappedScopForCodegen->scop().treeSyncUpdateMap.size() != 0) {
923926
code << code::cuda::common;
924927
code << code::cuda::cubBlockReduce;

test/test_cuda_mapper.cc

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1018,8 +1018,9 @@ def fun(float(N) I) -> (O) {
10181018
)TC";
10191019
auto mappingOptions = DefaultOptions().useReadOnlyCache(true);
10201020
auto code = codegenMapped(tc, mappingOptions);
1021-
ASSERT_TRUE(code.find("__ldg(&O") == std::string::npos) << code; // no
1022-
ASSERT_TRUE(code.find("__ldg(&I") != std::string::npos) << code; // yes
1021+
using tc::code::cuda::kLdg;
1022+
ASSERT_TRUE(code.find(kLdg + "(&O") == std::string::npos) << code; // no
1023+
ASSERT_TRUE(code.find(kLdg + "(&I") != std::string::npos) << code; // yes
10231024
}
10241025

10251026
/*

0 commit comments

Comments
 (0)