Skip to content
This repository was archived by the owner on Apr 28, 2023. It is now read-only.

Commit 9155a07

Browse files
committed
promoteToRegistersBelow: limit the number of registers to use
Introduce a per-thread limit on the total number of registers to use during promotion. This limit does not differentiate between data types because we cannot control register allocation at the CUDA level anyway. It rather serves as a controllable input to the promotion heuristic.
1 parent 1e5ad91 commit 9155a07

File tree

2 files changed

+23
-3
lines changed

2 files changed

+23
-3
lines changed

tc/core/polyhedral/cuda/memory_promotion_heuristic.cc

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -630,8 +630,17 @@ void promoteToSharedAtDepth(
630630
* of "mscop". Throw if promotion would violate the well-formedness of the
631631
* schedule tree, in particular in cases of promotion immediately below
632632
* a set/sequence node or immediately above a thread-specific marker node.
633+
* Promote at most "maxElements" elements per thread and return the difference
634+
* between "maxElements" and the number of actually promoted elements. Note
635+
* that this function does not differentiate types and sizes of the promoted
636+
* elements because register allocation cannot be controlled at the CUDA level
637+
* anyway. Instead, the "maxElements" value controls how much register
638+
* promotion is performed overall.
633639
*/
634-
void promoteToRegistersBelow(MappedScop& mscop, detail::ScheduleTree* scope) {
640+
size_t promoteToRegistersBelow(
641+
MappedScop& mscop,
642+
detail::ScheduleTree* scope,
643+
size_t maxElements) {
635644
// Cannot promote below a sequence or a set node. Promotion may insert an
636645
// extension node, but sequence/set must be followed by filters.
637646
if (scope->as<detail::ScheduleTreeSequence>() ||
@@ -684,6 +693,12 @@ void promoteToRegistersBelow(MappedScop& mscop, detail::ScheduleTree* scope) {
684693
if (sizes.size() == 0) {
685694
continue;
686695
}
696+
// Do not promote if it would require more registers than remain available.
697+
auto nElements = std::accumulate(
698+
sizes.begin(), sizes.end(), 1u, std::multiplies<size_t>());
699+
if (nElements > maxElements) {
700+
continue;
701+
}
687702
if (!isPromotableToRegistersBelow(
688703
*group, root, scope, partialSchedMupa, threadSchedule)) {
689704
continue;
@@ -703,13 +718,14 @@ void promoteToRegistersBelow(MappedScop& mscop, detail::ScheduleTree* scope) {
703718
std::move(group),
704719
scope,
705720
partialSched);
721+
maxElements -= nElements;
706722
}
707723
}
708724

709725
// Return immediately if nothing was promoted.
710726
if (scope->numChildren() == 0 ||
711727
!matchOne(extension(sequence(any())), scope->child({0}))) {
712-
return;
728+
return maxElements;
713729
}
714730

715731
// If promoting above thread mapping, insert synchronizations.
@@ -725,6 +741,7 @@ void promoteToRegistersBelow(MappedScop& mscop, detail::ScheduleTree* scope) {
725741
if (functional::Filter(isMappingTo<mapping::ThreadId>, ancestors).empty()) {
726742
scop.insertSyncsAroundSeqChildren(scope->child({0, 0}));
727743
}
744+
return maxElements;
728745
}
729746

730747
/*

tc/core/polyhedral/cuda/memory_promotion_heuristic.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,10 @@ void promoteToSharedAtDepth(
4141
std::size_t sharedMemorySize,
4242
bool unrollCopies);
4343

44-
void promoteToRegistersBelow(MappedScop& mscop, detail::ScheduleTree* scope);
44+
size_t promoteToRegistersBelow(
45+
MappedScop& mscop,
46+
detail::ScheduleTree* scope,
47+
std::size_t maxElements = SIZE_MAX);
4548

4649
void promoteToRegistersAtDepth(MappedScop& scop, std::size_t depth);
4750

0 commit comments

Comments
 (0)