This repository was archived by the owner on Apr 28, 2023. It is now read-only.

Commit a3ad5b0

register promotion: promote groups with more than one element
Relax the restriction that a promoted group contain only one element. Instead, check that only the dimensions that either correspond to unrolled loops or are mapped to blocks or threads appear in the access subscripts. During code generation, the corresponding loop iterators are either replaced by constants due to unrolling or become implicit in block/thread-specific groups (i.e., each thread in each block accesses different elements, so it can use the same "address" for them).
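To make the new condition concrete, here is a minimal CUDA sketch (a hypothetical kernel, not code from this commit) of an access group that now qualifies for register promotion: the subscripts involve only the thread-mapped index and a fully unrolled loop iterator, so after unrolling each promoted element sits in a fixed per-thread register.

// Hypothetical illustration, not part of this commit: a group of four elements
// per thread becomes promotable because its subscripts use only the
// thread-mapped index "i" and the fully unrolled iterator "r".
__global__ void scale4(float* A, float alpha) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  float a[4]; // promoted copy of A[4 * i .. 4 * i + 3], held in registers
#pragma unroll
  for (int r = 0; r < 4; ++r) {
    a[r] = A[4 * i + r]; // after unrolling, "r" is a constant subscript
  }
#pragma unroll
  for (int r = 0; r < 4; ++r) {
    a[r] *= alpha;
  }
#pragma unroll
  for (int r = 0; r < 4; ++r) {
    A[4 * i + r] = a[r];
  }
}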
1 parent 518a7a9 commit a3ad5b0

File tree

3 files changed: +105 -23 lines changed

tc/core/polyhedral/cuda/mapped_scop.cc

Lines changed: 1 addition & 1 deletion
@@ -1068,7 +1068,7 @@ std::unique_ptr<MappedScop> MappedScop::makeWithOuterBlockInnerThreadStrategy(
 
   // 9. Promote to registers below the loops mapped to threads.
   if (cudaOptions.proto().use_private_memory()) {
-    promoteToRegistersBelowThreads(mappedScop->scop(), -1ull);
+    promoteToRegistersBelowThreads(*mappedScop, -1ull);
   }
 
   LOG_IF(INFO, FLAGS_debug_tc_mapper)

tc/core/polyhedral/cuda/memory_promotion_heuristic.cc

Lines changed: 103 additions & 21 deletions
@@ -346,35 +346,118 @@ isl::union_set collectMappingsTo(const Scop& scop) {
   return mapping;
 }
 
+/*
+ * Check that only unrolled loops may appear in access subscripts.
+ * Because the scoping point can be above a branching tree, descend into each
+ * leaf of the subtree below the scoping point. For each leaf, construct an
+ * affine multi-expression containing only those band members between the
+ * scoping point and the leaf that are fully unrolled.
+ *
+ * Within each instance of the scope loops, check that loops that are either
+ * unrolled or mapped to threads access a single tensor element in the group
+ * (other loop indices will then not appear in the subscripts, making register
+ * promotion possible). In other words, check that the relation between the
+ * flat product of prefix, thread-mapped, and unrolled loop indices and
+ * accessed elements is single-valued.
+ *
+ * If band members are mapped to blocks(threads), they may still correspond to
+ * loops in the code in cases where the number of blocks(threads) is less than
+ * the extent of the band member. If there is no "unroll" flag on these
+ * members, we require that they not appear in the access subscripts similarly
+ * to regular loops. This is slightly more conservative than necessary because
+ * the actual generated loop iterators may disappear from the access after
+ * mapping to threads in cases where they are used with a modulo that is less
+ * than the number of blocks(threads). Precise analysis requires non-trivial
+ * schedule manipulations or explicit tiling by grid(block) sizes before
+ * mapping to blocks(threads).
+ *
+ * TODO: note that if a group is formed from partially overlapping references,
+ * one must consider per-reference access relation for single-valuedness as
+ * different references may have different values, but all of them remain
+ * independent of non-unrolled loop iterators.
+ */
+bool accessSubscriptsAreUnrolledLoops(
+    const TensorReferenceGroup& group,
+    const detail::ScheduleTree* root,
+    const detail::ScheduleTree* scope,
+    isl::multi_union_pw_aff outerSchedule) {
+  using namespace detail;
+
+  auto nodes = ScheduleTree::collect(scope);
+  auto leaves = functional::Filter(
+      [](const ScheduleTree* tree) { return tree->numChildren() == 0; }, nodes);
+
+  auto domainNode = root->elemAs<detail::ScheduleTreeElemDomain>();
+  TC_CHECK(domainNode);
+  auto domain = domainNode->domain_;
+
+  // Descend into every leaf.
+  for (auto leaf : leaves) {
+    auto ancestors = leaf->ancestors(root);
+    ancestors.push_back(leaf);
+    auto subdomain = activeDomainPointsBelow(root, leaf);
+
+    auto unrolledDims = isl::union_pw_aff_list(leaf->ctx_, 1);
+    for (auto node : ancestors) {
+      auto band = node->elemAs<detail::ScheduleTreeElemBand>();
+      if (!band) {
+        continue;
+      }
+
+      isl::multi_union_pw_aff schedule = band->mupa_;
+      schedule = schedule.intersect_domain(subdomain);
+      for (size_t i = 0, e = band->nMember(); i < e; ++i) {
+        if (!band->unroll_[i]) {
+          continue;
+        }
+        unrolledDims = unrolledDims.add(schedule.get_union_pw_aff(i));
+      }
+    }
+
+    auto space = isl::space(leaf->ctx_, 0, unrolledDims.n())
+                     .align_params(subdomain.get_space());
+    auto unrolledDimsMupa = isl::multi_union_pw_aff(space, unrolledDims);
+
+    // It is possible that no loops are unrolled, in which case
+    // unrolledDimsMupa is zero-dimensional and needs an explicit domain
+    // to be convertible to a union_map.
+    unrolledDimsMupa =
+        unrolledDimsMupa.intersect_domain(group.originalAccesses().domain());
+
+    auto accesses = group.originalAccesses();
+    auto schedule = outerSchedule.flat_range_product(unrolledDimsMupa);
+    accesses = accesses.apply_domain(isl::union_map::from(schedule));
+
+    if (!accesses.is_single_valued()) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
 /*
  * Check if the given "group" can be promoted to registers for the given
  * mapping to thread identifiers and within the given outer schedule.
  *
- * In particular, the group's footprint must contain only one element and the
+ * In particular, all tensor subscripts that may appear in the promoted access
+ * must be either unrolled loops or thread identifiers and the
  * same tensor element should never be accessed by two different threads
  * within the same iteration of the outer schedule.
  * The second test is performed by checking that there is only a single
  * thread associated to a given pair of tensor element and outer schedule
  * iteration.
- * Note that the test for a single thread is performed by looking
- * at the range of "thread". This range may be larger than the number
- * of threads, such that multiple instances may get mapped to the same thread.
- * Requiring different such instances is therefore slightly more conservative
- * than strictly needed.
  */
-bool isPromotableToRegisterBelowThreads(
+bool isPromotableToRegistersBelow(
     const TensorReferenceGroup& group,
+    const detail::ScheduleTree* root,
+    const detail::ScheduleTree* scope,
     isl::multi_union_pw_aff outer,
     isl::multi_union_pw_aff thread) {
   auto originalAccesses = group.originalAccesses();
 
-  // Return early if more than one element needs to be stored in registers.
-  // TODO: support arrays in registers if they are only accessed with constant
-  // subscripts, e.g. if the inner loops are fully unrolled.
-  auto sizes = group.approximationSizes();
-  auto nElements =
-      std::accumulate(sizes.begin(), sizes.end(), 1, std::multiplies<size_t>());
-  if (nElements != 1) {
+  if (!accessSubscriptsAreUnrolledLoops(
+          group, root, scope, outer.flat_range_product(thread))) {
     return false;
   }
 
@@ -567,21 +650,20 @@ void promoteGreedilyAtDepth(
 }
 
 // Promote at the positions of the thread specific markers.
-void promoteToRegistersBelowThreads(Scop& scop, size_t nRegisters) {
+void promoteToRegistersBelowThreads(MappedScop& mscop, size_t nRegisters) {
   using namespace tc::polyhedral::detail;
 
+  auto& scop = mscop.scop();
   auto root = scop.scheduleRoot();
+  auto threadMapping = mscop.threadMappingSchedule(root);
 
   {
     auto markers = findThreadSpecificMarkers(root);
 
     for (auto marker : markers) {
       auto partialSched = prefixSchedule(root, marker);
       // Pure affine schedule without (mapping) filters.
-      auto mapping = findThreadMappingAncestor(root, marker);
-      auto prefixSchedMupa = prefixScheduleMupa(root, mapping);
-      auto mapSchedMupa = infixScheduleMupa(root, mapping, marker);
-      auto partialSchedMupa = prefixSchedMupa.flat_range_product(mapSchedMupa);
+      auto partialSchedMupa = partialScheduleMupa(root, marker);
 
       // Because this function is called below the thread mapping marker,
       // partialSched has been intersected with both the block and the thread
@@ -600,8 +682,8 @@ void promoteToRegistersBelowThreads(Scop& scop, size_t nRegisters) {
       if (sizes.size() == 0) {
         continue;
       }
-      if (!isPromotableToRegisterBelowThreads(
-              *group, prefixSchedMupa, mapSchedMupa)) {
+      if (!isPromotableToRegistersBelow(
+              *group, root, marker, partialSchedMupa, threadMapping)) {
        continue;
       }
       if (!hasReuseWithin(*group, partialSchedMupa)) {
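The promotability test introduced above ultimately reduces to isl's single-valuedness check on the relation between schedule points (outer schedule, thread identifiers, and unrolled loop iterators) and the accessed tensor elements. A minimal standalone sketch of that idea, assuming isl's exception-based C++ bindings (isl/cpp.h) rather than TC's own wrappers, with made-up statement and tensor names:

// Toy illustration, independent of TC: an access is register-promotable only
// if each tuple of (thread-mapped, unrolled) indices touches a single element,
// i.e. the scheduled access relation is single-valued.
#include <isl/cpp.h>
#include <cassert>

int main() {
  isl::ctx ctx(isl_ctx_alloc());
  {
    // Schedule keeping only the thread-mapped index t and the unrolled index r;
    // the remaining loop index j is projected away, as in the promotion test.
    isl::union_map schedule(ctx, "{ S[t, r, j] -> [t, r] : 0 <= j < 8 }");

    // Subscripts depend only on t and r: single-valued, hence promotable.
    isl::union_map promotable(ctx, "{ S[t, r, j] -> A[t, r] : 0 <= j < 8 }");
    // Subscript also depends on j: not single-valued, hence rejected.
    isl::union_map rejected(ctx, "{ S[t, r, j] -> A[t, j] : 0 <= j < 8 }");

    assert(promotable.apply_domain(schedule).is_single_valued());
    assert(!rejected.apply_domain(schedule).is_single_valued());
  }
  isl_ctx_free(ctx.release());
  return 0;
}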

tc/core/polyhedral/cuda/memory_promotion_heuristic.h

Lines changed: 1 addition & 1 deletion
@@ -38,6 +38,6 @@ void promoteGreedilyAtDepth(
     std::size_t sharedMemorySize,
     bool unrollCopies);
 
-void promoteToRegistersBelowThreads(Scop& scop, std::size_t nRegisters);
+void promoteToRegistersBelowThreads(MappedScop& scop, std::size_t nRegisters);
 } // namespace polyhedral
 } // namespace tc
