Skip to content
This repository was archived by the owner on Apr 28, 2023. It is now read-only.

Commit 04ba6c4

Browse files
author
Sven Verdoolaege
committed
insert reduction synchronization immediately after mapping to threads
This brings the synchronization introduction closer to where the reduction band member is mapped to threads, reducing the risk that anything could go wrong in between. This should also make the code easier to follow.
1 parent a7ae6b8 commit 04ba6c4

File tree

2 files changed

+24
-25
lines changed

2 files changed

+24
-25
lines changed

tc/core/polyhedral/cuda/mapped_scop.cc

Lines changed: 17 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -338,11 +338,12 @@ size_t MappedScop::mapToThreads(detail::ScheduleTree* band) {
338338
// then the innermost of those will be used.
339339
auto nCanMap = bandNode->nOuterCoincident();
340340

341+
auto isReduction = reductionBandUpdates_.count(band) == 1;
341342
// If the band has a detected reduction, then the first member
342343
// after the coincident members is the reduction member and
343344
// this member has to be mapped as well.
344345
// In particular, it will get mapped to threadIdx.x
345-
if (reductionBandUpdates_.count(band) == 1) {
346+
if (isReduction) {
346347
CHECK(reductionBandUpdates_.at(band).separated);
347348
nCanMap++;
348349
}
@@ -370,6 +371,10 @@ size_t MappedScop::mapToThreads(detail::ScheduleTree* band) {
370371
band = map(band, dim, id);
371372
}
372373

374+
if (isReduction) {
375+
splitOutReductionAndInsertSyncs(band, nCanMap - 1);
376+
}
377+
373378
return nMappedThreads;
374379
}
375380

@@ -585,19 +590,16 @@ std::tuple<std::string, tc::Grid, tc::Block> MappedScop::codegen(
585590
mappedScopForCodegen->numThreads);
586591
}
587592

588-
// Split out reduction loops into separate bands and insert reduction
589-
// synchronizations outside those bands.
590-
void MappedScop::splitOutReductionsAndInsertSyncs() {
593+
// Split out reduction member at position "dim" in "band" and
594+
// insert reduction synchronizations outside this split off band.
595+
void MappedScop::splitOutReductionAndInsertSyncs(
596+
detail::ScheduleTree* band,
597+
int dim) {
591598
using namespace polyhedral::detail;
592599

593-
for (auto bandUpdate : reductionBandUpdates_) {
594-
auto tree = bandSplitOut(
595-
scop_->scheduleRoot(),
596-
const_cast<ScheduleTree*>(bandUpdate.first),
597-
bandUpdate.second.reductionDim);
598-
for (auto updateId : bandUpdate.second.ids) {
599-
scop_->insertReductionSync1D(tree, updateId);
600-
}
600+
auto tree = bandSplitOut(scop_->scheduleRoot(), band, dim);
601+
for (auto updateId : reductionBandUpdates_.at(band).ids) {
602+
scop_->insertReductionSync1D(tree, updateId);
601603
}
602604
}
603605

@@ -664,13 +666,7 @@ std::unique_ptr<MappedScop> MappedScop::makeWithOuterBlockInnerThreadStrategy(
664666
LOG_IF(INFO, FLAGS_debug_tc_mapper) << "After mapping to blocks:" << std::endl
665667
<< *mappedScop->schedule();
666668

667-
// 7. Insert reduction synchronizations if necessary.
668-
mappedScop->splitOutReductionsAndInsertSyncs();
669-
LOG_IF(INFO, FLAGS_debug_tc_mapper)
670-
<< "After inserting reduction synchronization:" << std::endl
671-
<< *mappedScop->schedule();
672-
673-
// 8. Promote to shared memory below the loops mapped to blocks.
669+
// 7. Promote to shared memory below the loops mapped to blocks.
674670
// This may split the outer band, so find the new outer band after promotion.
675671
if (cudaOptions.proto().use_shared_memory()) {
676672
size_t sharedMemorySize = cudaOptions.proto().has_max_shared_memory()
@@ -717,13 +713,13 @@ std::unique_ptr<MappedScop> MappedScop::makeWithOuterBlockInnerThreadStrategy(
717713
}
718714
}
719715

720-
// 9. Promote to registers below the loops mapped to threads.
716+
// 8. Promote to registers below the loops mapped to threads.
721717
if (cudaOptions.proto().use_private_memory()) {
722718
promoteToRegistersBelowThreads(
723719
mappedScop->scop(), mappedScop->threadIdxXScheduleDepthState, -1ull);
724720
}
725721

726-
// 10. Insert mapping context
722+
// 9. Insert mapping context
727723
mappedScop->insertMappingContext();
728724
LOG_IF(INFO, FLAGS_debug_tc_mapper)
729725
<< "After outerBlockInnerThread strategy:" << std::endl

tc/core/polyhedral/cuda/mapped_scop.h

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -155,13 +155,16 @@ class MappedScop {
155155
// The remaining parts, if any, are no longer considered for replacement
156156
// by a library call.
157157
detail::ScheduleTree* separateReduction(detail::ScheduleTree* band);
158-
// Split out reduction bands and insert reduction synchronizations.
159-
void splitOutReductionsAndInsertSyncs();
158+
// Split out reduction member at position "dim" in "band" and
159+
// insert reduction synchronizations.
160+
void splitOutReductionAndInsertSyncs(detail::ScheduleTree* band, int dim);
160161
// Map "band" to thread identifiers using as many blockSizes values as outer
161-
// coincident dimensions (plus reduction dimension, if any), and
162+
// coincident dimensions (plus reduction dimension, if any),
163+
// insert synchronization in case of a reduction, and
162164
// return the number of mapped thread identifiers.
163165
size_t mapToThreads(detail::ScheduleTree* band);
164-
// Map innermost bands to thread identifiers and
166+
// Map innermost bands to thread identifiers,
167+
// inserting synchronization in case of a reduction, and
165168
// return the number of mapped thread identifiers.
166169
size_t mapInnermostBandsToThreads(detail::ScheduleTree* st);
167170

0 commit comments

Comments
 (0)