Skip to content
This repository was archived by the owner on Apr 28, 2023. It is now read-only.

Commit 30a1634

Browse files
ftynse and Sven Verdoolaege
authored and committed
Add support for __ldg
CUDA supports a faster read-only cache for data that is never written. This commit adds support in cuda/codegen to generate such accesses. This is achieved by simply inspecting the schedule tree for all read-only tensor references and saving their isl::id in a set that is passed in the context. The corresponding CudaMappingOptions are also added to control triggering of the option. A simple test is also added.
1 parent 9f9e74c commit 30a1634

File tree

14 files changed

+138
-16
lines changed

14 files changed

+138
-16
lines changed

docs/source/mapping_options.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,8 @@ The following options are currently available:
7575

7676
* :code:`.unrollCopyShared(<boolean>)`: Also unroll the copies to and from shared memory introduced by the :code:`TC` mapper. If :code:`unroll` value is not provided, has no effect.
7777

78+
* :code:`.useReadOnlyCache(<boolean>)`: Emit loads through the read-only cache (:code:`__ldg`) when appropriate.
79+
7880
* :code:`.matchLibraryCalls(<boolean>)`: Replace computation patterns with calls to highly optimized libraries (such as CUB, CUTLASS) when possible.
7981

8082
* :code:`.fixParametersBeforeScheduling(<boolean>)`: Perform automatic loop scheduling taking into account specific tensor sizes. May produce faster kernels but significantly increases compilation time. Note that the *mapping* will be performed for specific tensor sizes anyway.

tc/autotuner/parameters.cc

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -235,6 +235,7 @@ void TuningConfiguration::applyToParameters(
235235
useSharedMemory.apply(f);
236236
usePrivateMemory.apply(f);
237237
unrollCopyShared.apply(f);
238+
useReadOnlyCache.apply(f);
238239
matchLibraryCalls.apply(f);
239240
}
240241

@@ -268,6 +269,7 @@ std::vector<ParameterView> TuningConfiguration::collectParameters() {
268269
params.emplace_back(useSharedMemory);
269270
params.emplace_back(usePrivateMemory);
270271
params.emplace_back(unrollCopyShared);
272+
params.emplace_back(useReadOnlyCache);
271273
params.emplace_back(matchLibraryCalls);
272274

273275
return params;
@@ -298,6 +300,7 @@ void TuningConfiguration::fromCudaMappingOptions(
298300
useSharedMemory.selectValue(options.proto().use_shared_memory());
299301
usePrivateMemory.selectValue(options.proto().use_private_memory());
300302
unrollCopyShared.selectValue(options.proto().unroll_copy_shared());
303+
useReadOnlyCache.selectValue(options.proto().use_readonly_cache());
301304
}
302305

303306
void TuningConfiguration::fromCpuMappingOptions(
@@ -325,6 +328,7 @@ void TuningConfiguration::applyToCudaMappingOptions(
325328
options.useSharedMemory(useSharedMemory.value());
326329
options.usePrivateMemory(usePrivateMemory.value());
327330
options.unrollCopyShared(unrollCopyShared.value());
331+
options.useReadOnlyCache(useReadOnlyCache.value());
328332
}
329333

330334
void TuningConfiguration::applyToCpuMappingOptions(
@@ -338,6 +342,7 @@ TuningConfiguration::TuningConfiguration()
338342
useSharedMemory("use shared memory"),
339343
usePrivateMemory("use private memory"),
340344
unrollCopyShared("unroll copy shared"),
345+
useReadOnlyCache("use readonly cache (i.e. emit __ldg loads)"),
341346
matchLibraryCalls("match library calls") {
342347
addValidator([](const TuningConfiguration& conf) {
343348
auto b0v = conf.blockParams.dims.at(0).value();
@@ -419,6 +424,7 @@ void TuningConfiguration::fixParameters(
419424
maybeFixScalar(fixedParams.useSharedMemory, useSharedMemory);
420425
maybeFixScalar(fixedParams.usePrivateMemory, usePrivateMemory);
421426
maybeFixScalar(fixedParams.unrollCopyShared, unrollCopyShared);
427+
maybeFixScalar(fixedParams.useReadOnlyCache, useReadOnlyCache);
422428
maybeFixScalar(fixedParams.matchLibraryCalls, matchLibraryCalls);
423429
}
424430

@@ -568,6 +574,11 @@ TuningParameterFixer& TuningParameterFixer::fixUnrollCopyShared(bool val) {
568574
return *this;
569575
}
570576

577+
TuningParameterFixer& TuningParameterFixer::fixUseReadOnlyCache(bool val) {
578+
useReadOnlyCache = val;
579+
return *this;
580+
}
581+
571582
TuningParameterFixer& TuningParameterFixer::fixMatchLibraryCalls(bool val) {
572583
matchLibraryCalls = val;
573584
return *this;

tc/autotuner/parameters.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,7 @@ class TuningConfiguration {
188188
BoolParameter useSharedMemory;
189189
BoolParameter usePrivateMemory;
190190
BoolParameter unrollCopyShared;
191+
BoolParameter useReadOnlyCache;
191192
BoolParameter matchLibraryCalls;
192193

193194
private:
@@ -209,6 +210,7 @@ class TuningParameterFixer {
209210
TuningParameterFixer& fixUseSharedMemory(bool val);
210211
TuningParameterFixer& fixUsePrivateMemory(bool val);
211212
TuningParameterFixer& fixUnrollCopyShared(bool val);
213+
TuningParameterFixer& fixUseReadOnlyCache(bool val);
212214
TuningParameterFixer& fixMatchLibraryCalls(bool val);
213215

214216
private:
@@ -223,6 +225,7 @@ class TuningParameterFixer {
223225
llvm::Optional<bool> useSharedMemory;
224226
llvm::Optional<bool> usePrivateMemory;
225227
llvm::Optional<bool> unrollCopyShared;
228+
llvm::Optional<bool> useReadOnlyCache;
226229
llvm::Optional<bool> matchLibraryCalls;
227230

228231
friend class TuningConfiguration;

tc/core/cuda/cuda_mapping_options.cc

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -311,6 +311,11 @@ CudaMappingOptions& CudaMappingOptions::unrollCopyShared(bool b) {
311311
return *this;
312312
}
313313

314+
CudaMappingOptions& CudaMappingOptions::useReadOnlyCache(bool b) {
315+
ownedProto_.set_use_readonly_cache(b);
316+
return *this;
317+
}
318+
314319
CudaMappingOptions& CudaMappingOptions::mapToThreads(
315320
const std::string& commaSeparatedSizes) {
316321
auto sizes = parseCommaSeparatedIntegers<uint64_t>(commaSeparatedSizes);
@@ -341,7 +346,8 @@ CudaMappingOptions CudaMappingOptions::makeUnmappedMappingOptions() {
341346
mo.genericMappingOptions(MappingOptions::makeUnmappedMappingOptions())
342347
.useSharedMemory(false)
343348
.usePrivateMemory(false)
344-
.unrollCopyShared(false);
349+
.unrollCopyShared(false)
350+
.useReadOnlyCache(false);
345351
return mo;
346352
}
347353

tc/core/cuda/cuda_mapping_options.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,7 @@ class CudaMappingOptions {
194194
CudaMappingOptions& usePrivateMemory(bool b);
195195
CudaMappingOptions& maxSharedMemory(uint64_t size);
196196
CudaMappingOptions& unrollCopyShared(bool b);
197+
CudaMappingOptions& useReadOnlyCache(bool b);
197198
///@}
198199

199200
/// Static constructors for predefined strategies.

tc/core/cuda/cuda_mapping_options_cpp_printer.cc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@ CudaMappingOptionsCppPrinter& operator<<(
3232
"usePrivateMemory", cudaOptions.proto().use_private_memory());
3333
prn.printBooleanOption(
3434
"unrollCopyShared", cudaOptions.proto().unroll_copy_shared());
35+
prn.printBooleanOption(
36+
"useReadOnlyCache", cudaOptions.proto().use_readonly_cache());
3537
if (cudaOptions.proto().has_max_shared_memory()) {
3638
prn.printValueOption(
3739
"maxSharedMemory", cudaOptions.proto().max_shared_memory());

tc/core/polyhedral/cuda/codegen.cc

Lines changed: 52 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -365,8 +365,32 @@ void emitReductionInit(
365365
}
366366

367367
namespace {
368+
// RAII-style wrapper around CodegenStatementContext that wraps the output
369+
// streamed to context.ss into an "__ldg()" intrinsic, if the tensor
370+
// with the given identifier is known to be read-only.
371+
struct LdgWrapper {
372+
public:
373+
LdgWrapper(const CodegenStatementContext& context, isl::id id)
374+
: readOnly_(context.readOnlySet.count(id) > 0), out_(context.ss) {
375+
if (readOnly_) {
376+
out_ << "__ldg(&";
377+
}
378+
}
379+
380+
~LdgWrapper() {
381+
if (readOnly_) {
382+
out_ << ")";
383+
}
384+
}
385+
386+
private:
387+
bool readOnly_;
388+
std::ostream& out_;
389+
};
390+
368391
template <typename AFF>
369392
void emitAccess(AFF access, const CodegenStatementContext& context) {
393+
LdgWrapper ldgWrapper(context, access.get_tuple_id(isl::dim_type::out));
370394
context.ss << context.build().access_from(access).to_C_str();
371395
}
372396
} // namespace
@@ -584,6 +608,8 @@ void emitMappedTensorAccess(
584608

585609
// Not promoted, emitting just the mapped subscript.
586610
if (!promotionInfo.groupId) {
611+
auto ctx = context.scop().domain().get_ctx();
612+
LdgWrapper ldgWrapper(context, isl::id(ctx, name));
587613
context.ss << name;
588614
for (auto e : subscripts) {
589615
context.ss << "[";
@@ -681,6 +707,29 @@ size_t& nAstNodes() {
681707
return n;
682708
}
683709

710+
namespace {
711+
// Collect ids of tensors that are only read in the Scop.
712+
std::unordered_set<isl::id, isl::IslIdIslHash> gatherReadOnlySet(
713+
const MappedScop& mscop) {
714+
std::unordered_set<isl::id, isl::IslIdIslHash> readOnlySet;
715+
716+
if (!mscop.useReadOnlyCache) {
717+
return readOnlySet;
718+
}
719+
720+
const auto& scop = mscop.scop();
721+
722+
auto read = scop.reads.universe().range();
723+
auto written = scop.writes.universe().range();
724+
auto readOnly = read.subtract(written);
725+
for (auto s : readOnly.get_set_list()) {
726+
readOnlySet.emplace(s.get_tuple_id());
727+
}
728+
729+
return readOnlySet;
730+
}
731+
} // namespace
732+
684733
string emitCudaKernel(
685734
const std::string& specializedName,
686735
const MappedScop& mscop) {
@@ -745,7 +794,9 @@ string emitCudaKernel(
745794
auto root = mscop.schedule();
746795
astBuild = astBuild.set_iterators(Codegen::makeLoopIterators(root));
747796
auto astNode = astBuild.node_from(schedule);
748-
AstPrinter(CodegenContext(ss, mscop, nodeInfoMap)).emit(astNode);
797+
798+
AstPrinter(CodegenContext(ss, mscop, nodeInfoMap, gatherReadOnlySet(mscop)))
799+
.emit(astNode);
749800
ss << "}" << endl;
750801

751802
return ss.str();

tc/core/polyhedral/cuda/codegen.h

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -83,10 +83,14 @@ struct CodegenContext {
8383
CodegenContext(
8484
std::stringstream& ss_,
8585
const MappedScop& s,
86-
const NodeInfoMapType& i)
87-
: ss(ss_), mappedScop(s), nodeInfoMap(i) {}
86+
const NodeInfoMapType& i,
87+
const std::unordered_set<isl::id, isl::IslIdIslHash>& ros)
88+
: ss(ss_), mappedScop(s), nodeInfoMap(i), readOnlySet(ros) {}
8889
CodegenContext(const CodegenContext& c)
89-
: ss(c.ss), mappedScop(c.mappedScop), nodeInfoMap(c.nodeInfoMap) {}
90+
: ss(c.ss),
91+
mappedScop(c.mappedScop),
92+
nodeInfoMap(c.nodeInfoMap),
93+
readOnlySet(c.readOnlySet) {}
9094

9195
const Scop& scop() const {
9296
return mappedScop.scop();
@@ -95,6 +99,7 @@ struct CodegenContext {
9599
std::stringstream& ss;
96100
const MappedScop& mappedScop;
97101
const NodeInfoMapType& nodeInfoMap;
102+
const std::unordered_set<isl::id, isl::IslIdIslHash>& readOnlySet;
98103
};
99104

100105
struct CodegenStatementContext : CodegenContext {

tc/core/polyhedral/cuda/mapped_scop.cc

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -908,7 +908,11 @@ std::unique_ptr<MappedScop> makeSpecializedMappedScop(
908908
tc::Block block = mappedScop.numThreads;
909909
std::tie(grid, block) = tightenLaunchBounds(*scop, grid, block);
910910
auto res = MappedScop::makeMappedScop(
911-
std::move(scop), grid, block, mappedScop.unroll);
911+
std::move(scop),
912+
grid,
913+
block,
914+
mappedScop.unroll,
915+
mappedScop.useReadOnlyCache);
912916
res->insertMappingContext();
913917

914918
LOG_IF(INFO, FLAGS_debug_tc_mapper)
@@ -985,7 +989,8 @@ std::unique_ptr<MappedScop> MappedScop::makeWithOuterBlockInnerThreadStrategy(
985989
std::move(scopUPtr),
986990
::tc::Grid(cudaOptions.grid),
987991
::tc::Block(cudaOptions.block),
988-
generic.proto.unroll()));
992+
generic.proto.unroll(),
993+
cudaOptions.proto().use_readonly_cache()));
989994
auto& scop = mappedScop->scop_;
990995

991996
// 1a. Optionally specialize before scheduling...

tc/core/polyhedral/cuda/mapped_scop.h

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -61,25 +61,28 @@ class MappedScop {
6161
std::unique_ptr<Scop>&& scop,
6262
::tc::Grid grid,
6363
::tc::Block block,
64-
uint64_t unroll_)
64+
uint64_t unroll_,
65+
bool useReadOnlyCache_)
6566
: scop_(std::move(scop)),
6667
numBlocks(grid),
6768
numThreads(block),
68-
unroll(unroll_) {}
69+
unroll(unroll_),
70+
useReadOnlyCache(useReadOnlyCache_) {}
6971

7072
public:
7173
static inline std::unique_ptr<MappedScop> makeOneBlockOneThread(
7274
std::unique_ptr<Scop>&& scop) {
7375
return std::unique_ptr<MappedScop>(new MappedScop(
74-
std::move(scop), ::tc::Grid{1, 1, 1}, ::tc::Block{1, 1, 1}, 1));
76+
std::move(scop), ::tc::Grid{1, 1, 1}, ::tc::Block{1, 1, 1}, 1, false));
7577
}
7678
static inline std::unique_ptr<MappedScop> makeMappedScop(
7779
std::unique_ptr<Scop>&& scop,
7880
::tc::Grid grid,
7981
::tc::Block block,
80-
uint64_t unroll) {
82+
uint64_t unroll,
83+
bool useReadOnlyCache) {
8184
return std::unique_ptr<MappedScop>(
82-
new MappedScop(std::move(scop), grid, block, unroll));
85+
new MappedScop(std::move(scop), grid, block, unroll, useReadOnlyCache));
8386
}
8487

8588
// Apply the hand-written OuterBlockInnerThread mapping strategy.
@@ -206,6 +209,7 @@ class MappedScop {
206209
const ::tc::Grid numBlocks;
207210
const ::tc::Block numThreads;
208211
const uint64_t unroll;
212+
const bool useReadOnlyCache;
209213

210214
private:
211215
// Information about a detected reduction that can potentially

0 commit comments

Comments
 (0)