Commit 5b448f0

merge promoteToSharedGreedy and promoteGreedilyAtDepth into one function
Arguably, it was a mistake to have separate functions in the first place. This led to situations in tests where the copies between global and shared memory were not mapped to threads. Merge promoteToSharedGreedy and promoteGreedilyAtDepth into a single function, promoteToSharedAtDepth. The name is chosen for consistency with promoteToRegistersAtDepth.
1 parent fe8f519 commit 5b448f0
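For orientation, a condensed sketch of the merged entry point, abridged from the memory_promotion_heuristic.cc diff below (the band-collection and promotion steps are elided; this is not the full implementation):

// Condensed sketch; see the full diff below.
void promoteToSharedAtDepth(
    MappedScop& mscop,
    size_t depth,
    size_t maxMemory,
    bool unrollCopies) {
  auto& scop = mscop.scop();
  // ... collect bands at "depth" and greedily promote tensor reference
  // groups to shared memory until "maxMemory" is exhausted ...
  // Map the global<->shared copies to threads. With the old two-function
  // split, callers of promoteToSharedGreedy could miss this step.
  mapCopiesToThreads(mscop, unrollCopies);
}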

File tree

4 files changed: +23 −27 lines

  tc/core/polyhedral/cuda/mapped_scop.cc
  tc/core/polyhedral/cuda/memory_promotion_heuristic.cc
  tc/core/polyhedral/cuda/memory_promotion_heuristic.h
  test/test_cuda_mapper_memory_promotion.cc

tc/core/polyhedral/cuda/mapped_scop.cc

Lines changed: 1 addition & 1 deletion
@@ -1079,7 +1079,7 @@ std::unique_ptr<MappedScop> MappedScop::makeWithOuterBlockInnerThreadStrategy(
       !generic.proto.has_unroll())
       << "requested to unroll copies to shared memory without providing the unroll size";
 
-  promoteGreedilyAtDepth(
+  promoteToSharedAtDepth(
       *mappedScop,
       std::min(band->nOuterCoincident(), mappedScop->numBlocks.view.size()),
       sharedMemorySize,

tc/core/polyhedral/cuda/memory_promotion_heuristic.cc

Lines changed: 20 additions & 21 deletions
@@ -540,6 +540,16 @@ void promoteToSharedBelow(
   scop.insertSyncsAroundCopies(node);
 }
 
+/*
+ * Check if "tree" is a band node mapped to threads. In particular, check that
+ * "tree" is a band and a thread-specific node appears as its only child.
+ */
+inline bool isThreadMappedBand(const detail::ScheduleTree* tree) {
+  return matchOne(band(threadSpecific(any())), tree) ||
+      matchOne(band(threadSpecific()), tree);
+}
+} // namespace
+
 /*
  * For every place in the schedule tree where schedule depth (i.e., the number
  * of preceding band members) is "depth", promote tensor reference groups to
@@ -550,14 +560,22 @@ void promoteToSharedBelow(
  *
  * Only promote if the tensor elements referenced by the group are reused or
  * accessed in a non-coalesced way.
+ *
+ * If "unrollCopies" is set, use the unroll factor from "mscop" to unroll the
+ * loops that copy values from global to shared memory and back.
  */
-void promoteToSharedGreedy(Scop& scop, size_t depth, size_t maxMemory) {
+void promoteToSharedAtDepth(
+    MappedScop& mscop,
+    size_t depth,
+    size_t maxMemory,
+    bool unrollCopies) {
   using namespace tc::polyhedral::detail;
 
   if (depth == 0) {
     throw promotion::PromotionNYI("promotion before any band");
   }
 
+  auto& scop = mscop.scop();
   auto root = scop.scheduleRoot();
 
   // 1. Collect all bands with a member located at the given depth in the
@@ -575,27 +593,8 @@ void promoteToSharedGreedy(Scop& scop, size_t depth, size_t maxMemory) {
   for (auto bandNode : bands) {
     promoteToSharedBelow(scop, bandNode, remainingMemory);
   }
-}
-
-/*
- * Check if "tree" is a band node mapped to threads. In particular, check that
- * "tree" is a band and a thread-specific node appears as its only child.
- */
-inline bool isThreadMappedBand(const detail::ScheduleTree* tree) {
-  return matchOne(band(threadSpecific(any())), tree) ||
-      matchOne(band(threadSpecific()), tree);
-}
-} // namespace
-
-void promoteGreedilyAtDepth(
-    MappedScop& mscop,
-    size_t depth,
-    size_t sharedMemorySize,
-    bool unrollCopies) {
-  // 1. Promote using heuristic.
-  promoteToSharedGreedy(mscop.scop(), depth, sharedMemorySize);
 
-  // 2. Map copies to shared, state by copy
+  // 3. Map copies to shared.
   mapCopiesToThreads(mscop, unrollCopies);
 }
 

tc/core/polyhedral/cuda/memory_promotion_heuristic.h

Lines changed: 1 addition & 4 deletions
@@ -33,10 +33,7 @@ class ScheduleTree;
 // promote to shared memory at "depth" until "sharedMemorySize" is used.
 // Map copies between global and shared memory to threads and unroll those
 // copies if "unrollCopies" is set, using the options in "mscop".
-// "threadIdxXScheduleDepthState" contains the schedule depth at which the
-// computation was mapped to thread x and is used to check whether the global
-// memory is accessed in a coalesced way.
-void promoteGreedilyAtDepth(
+void promoteToSharedAtDepth(
     MappedScop& scop,
     std::size_t depth,
     std::size_t sharedMemorySize,

test/test_cuda_mapper_memory_promotion.cc

Lines changed: 1 addition & 1 deletion
@@ -392,7 +392,7 @@ def fun(float(N, M) A) -> (B, C) {
       size_t maxSharedMemory) {
     auto mscop = prepareScop(
         tc, {{"N", problemSize1}, {"M", problemSize2}}, {tileSize1, tileSize2});
-    promoteGreedilyAtDepth(*mscop, depth, maxSharedMemory, false);
+    promoteToSharedAtDepth(*mscop, depth, maxSharedMemory, false);
     return mscop;
   }
 };
