Skip to content
This repository was archived by the owner on Apr 28, 2023. It is now read-only.

Commit 31fd67b

Browse files
committed
register promotion: promote below any node in the tree
Extend promoteToRegistersBelow to take any schedule tree node as a scoping point and attempt register promotion below that node. Independently of the promotion scope, tensor reference groups must be computed per-thread (and per-block). Include the thread (and block) mapping in the domain of the outer schedule when computing the groups. Since registers are private to a thread, different threads should not access the same value within the given scope. Inside the promotion scope, there may be multiple subtrees with different thread mappings. Furthermore, these mappings are not included in the prefix (scope) schedule. Always include thread schedule when checking if accesses are performed from one thread only. This schedule takes into account potentially different mappings. Compute reuse within threads by appending thread schedule to the prefix schedule before checking for reuse.
1 parent f8b49cc commit 31fd67b

File tree

4 files changed

+183
-35
lines changed

4 files changed

+183
-35
lines changed

tc/core/polyhedral/cuda/memory_promotion_heuristic.cc

Lines changed: 48 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -649,27 +649,55 @@ void promoteGreedilyAtDepth(
649649
mapCopiesToThreads(mscop, unrollCopies);
650650
}
651651

652-
namespace {
653-
654652
/*
655-
* Perform promotion to registers below the thread specific marker "marker"
656-
* in the schedule tree of "mscop".
653+
* Perform promotion to registers below the node "scope" in the schedule tree
654+
* of "mscop". Throw if promotion would violate the well-formedness of the
655+
* schedule tree, in particular in cases of promotion immediately below
656+
* a set/sequence node or immediately above a thread-specific marker node.
657657
*/
658-
void promoteToRegistersBelow(MappedScop& mscop, detail::ScheduleTree* marker) {
658+
void promoteToRegistersBelow(MappedScop& mscop, detail::ScheduleTree* scope) {
659+
// Cannot promote below a sequence or a set node. Promotion may insert an
660+
// extension node, but sequence/set must be followed by filters.
661+
if (scope->elemAs<detail::ScheduleTreeElemSequence>() ||
662+
scope->elemAs<detail::ScheduleTreeElemSet>()) {
663+
throw promotion::IncorrectScope("cannot promote under a sequence/set node");
664+
}
665+
// Cannot promote between a thread-mapped band and a thread-specific marker
666+
// node because the latter is used to identify thread-mapped bands as
667+
// immediate ancestors.
668+
if (scope->numChildren() == 1 &&
669+
scope->child({0})
670+
->elemAs<detail::ScheduleTreeElemThreadSpecificMarker>()) {
671+
throw promotion::IncorrectScope(
672+
"cannot promote above a thread-specific marker node");
673+
}
674+
659675
auto& scop = mscop.scop();
660676
auto root = scop.scheduleRoot();
661-
auto threadMapping = mscop.threadMappingSchedule(root);
662677

663-
auto partialSched = prefixSchedule(root, marker);
678+
// Compute groups specific to threads and blocks by including the mappings
679+
// into the domain of the partial schedule.
680+
auto blockMapping = collectMappingsTo<mapping::BlockId>(scop);
681+
auto mapping =
682+
collectMappingsTo<mapping::ThreadId>(scop).intersect(blockMapping);
683+
auto schedule = partialSchedule(scop.scheduleRoot(), scope);
684+
auto groupMap = TensorReferenceGroup::accessedWithin(
685+
schedule.intersect_domain(mapping), scop.reads, scop.writes);
686+
687+
auto threadSchedule = mscop.threadMappingSchedule(mscop.schedule());
688+
auto blockSchedule = mscop.blockMappingSchedule(mscop.schedule());
689+
664690
// Pure affine schedule without (mapping) filters.
665-
auto partialSchedMupa = partialScheduleMupa(root, marker);
691+
auto partialSchedMupa = partialScheduleMupa(root, scope);
692+
// Schedule with block mapping filter.
693+
auto partialSched =
694+
isl::union_map::from(partialSchedMupa).intersect_domain(blockMapping);
695+
// The following promotion validity and profitability checks need to be
696+
// performed with respect to the block mapping, so append the block schedule.
697+
// If the partial schedule contains it already, it will just end up with
698+
// identical dimensions without affecting the result of the checks.
699+
partialSchedMupa = partialSchedMupa.flat_range_product(blockSchedule);
666700

667-
// Because this function is called below the thread mapping marker,
668-
// partialSched has been intersected with both the block and the thread
669-
// mapping filters. Therefore, groups will be computed relative to
670-
// blocks and threads.
671-
auto groupMap = TensorReferenceGroup::accessedWithin(
672-
partialSched, scop.reads, scop.writes);
673701
for (auto& tensorGroups : groupMap) {
674702
auto tensorId = tensorGroups.first;
675703

@@ -682,27 +710,28 @@ void promoteToRegistersBelow(MappedScop& mscop, detail::ScheduleTree* marker) {
682710
continue;
683711
}
684712
if (!isPromotableToRegistersBelow(
685-
*group, root, marker, partialSchedMupa, threadMapping)) {
713+
*group, root, scope, partialSchedMupa, threadSchedule)) {
686714
continue;
687715
}
688-
if (!hasReuseWithin(*group, partialSchedMupa)) {
716+
// Check reuse within threads.
717+
auto schedule = partialSchedMupa.flat_range_product(threadSchedule);
718+
if (!hasReuseWithin(*group, schedule)) {
689719
continue;
690720
}
721+
691722
// TODO: if something is already in shared, but is reused within one
692723
// thread only, there is no point in keeping it in shared _if_ it
693724
// gets promoted into a register.
694725
scop.promoteGroup(
695726
Scop::PromotedDecl::Kind::Register,
696727
tensorId,
697728
std::move(group),
698-
marker,
729+
scope,
699730
partialSched);
700731
}
701732
}
702733
}
703734

704-
} // namespace
705-
706735
// Promote at the positions of the thread specific markers.
707736
void promoteToRegistersBelowThreads(MappedScop& mscop, size_t nRegisters) {
708737
using namespace tc::polyhedral::detail;

tc/core/polyhedral/cuda/memory_promotion_heuristic.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,10 @@ namespace polyhedral {
2525
class MappedScop;
2626
class Scop;
2727

28+
namespace detail {
29+
class ScheduleTree;
30+
}
31+
2832
// In the given mapped scop "mscop",
2933
// promote to shared memory at "depth" until "sharedMemorySize" is used.
3034
// Map copies between global and shared memory to threads and unroll those
@@ -38,6 +42,8 @@ void promoteGreedilyAtDepth(
3842
std::size_t sharedMemorySize,
3943
bool unrollCopies);
4044

45+
void promoteToRegistersBelow(MappedScop& mscop, detail::ScheduleTree* scope);
46+
4147
void promoteToRegistersBelowThreads(MappedScop& scop, std::size_t nRegisters);
4248
} // namespace polyhedral
4349
} // namespace tc

tc/core/polyhedral/exceptions.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,10 @@ struct PromotionNYI : public std::logic_error {
6262
struct GroupingError : public std::logic_error {
6363
explicit GroupingError(const std::string& s) : std::logic_error(s) {}
6464
};
65+
66+
struct IncorrectScope : public std::logic_error {
67+
explicit IncorrectScope(const std::string& s) : std::logic_error(s) {}
68+
};
6569
} // namespace promotion
6670

6771
namespace codegen {

test/test_cuda_mapper_memory_promotion.cc

Lines changed: 125 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -451,8 +451,8 @@ TEST_F(MapperMemoryPromotionRAW, throwIfCopiesBelowThreads) {
451451
}
452452

453453
class MatMulBias : public TestMapper {
454-
public:
455-
std::string emitCode(
454+
protected:
455+
std::unique_ptr<MappedScop> prepare(
456456
const std::unordered_map<std::string, size_t>& parameters,
457457
const CudaMappingOptions& mappingOptions) {
458458
std::string tc = R"TC(
@@ -462,9 +462,45 @@ def fun(float(N,K) A, float(K,M) B, float(N,M) C) -> (O) {
462462
}
463463
)TC";
464464

465-
auto mscop = makeMappedScop(tc, mappingOptions, parameters);
465+
return makeMappedScop(tc, mappingOptions, parameters);
466+
}
467+
468+
std::string emitCode(const std::unique_ptr<MappedScop>& mscop) {
466469
return std::get<0>(mscop->codegen("fun"));
467470
}
471+
472+
std::string emitCode(
473+
const std::unordered_map<std::string, size_t>& parameters,
474+
const CudaMappingOptions& mappingOptions) {
475+
return emitCode(prepare(parameters, mappingOptions));
476+
}
477+
478+
void expectNoABCPromotion(const std::string& code) {
479+
auto aDeclPos = code.find(" float32 _A_0");
480+
auto bDeclPos = code.find(" float32 _B_0");
481+
auto cDeclPos = code.find(" float32 _C_0");
482+
EXPECT_TRUE(aDeclPos == std::string::npos)
483+
<< "tensor A promoted to register but has elements accessed "
484+
<< "by multiple threads";
485+
EXPECT_TRUE(bDeclPos == std::string::npos)
486+
<< "tensor B promoted to register but has elements accessed "
487+
<< "by multiple threads";
488+
EXPECT_TRUE(cDeclPos == std::string::npos)
489+
<< "tensor C promoted to register but has no reuse";
490+
}
491+
492+
void expectNoSymbolicSubscript(const std::string& code) {
493+
// We don't know the exact name of the iterator, but it starts with c.
494+
auto oWithIteratorPos = code.find("_O_0[c");
495+
auto oWithThreadPos = code.find("_O_0[t1");
496+
497+
EXPECT_TRUE(oWithIteratorPos == std::string::npos)
498+
<< "accessing local arrays with iterators in subscripts makes "
499+
<< "these arrays placed in local memory instead of registers";
500+
EXPECT_TRUE(oWithThreadPos == std::string::npos)
501+
<< "expected per-thread groups to be computed, i.e. thread "
502+
<< "identifiers should not appear in the subscripts";
503+
}
468504
};
469505

470506
TEST_F(MatMulBias, RegisterPromotion) {
@@ -482,8 +518,6 @@ TEST_F(MatMulBias, RegisterPromotion) {
482518

483519
auto originalAccPos =
484520
code.find("O[32 * b0 + c3][t0 + 32 * b1]", copyToPos + 1);
485-
auto cDeclPos = code.find("float32 _C_0");
486-
auto aDeclPos = code.find("float32 _A_0");
487521

488522
EXPECT_TRUE(declPos != std::string::npos) << "no declaration of the register";
489523
EXPECT_TRUE(copyToPos != std::string::npos) << "expected copy to register";
@@ -492,10 +526,8 @@ TEST_F(MatMulBias, RegisterPromotion) {
492526

493527
EXPECT_NE(originalAccPos, copyFromPos)
494528
<< "global array reference is used in main computation";
495-
EXPECT_TRUE(cDeclPos == std::string::npos)
496-
<< "tensor C promoted to register but has no reuse";
497-
EXPECT_TRUE(aDeclPos == std::string::npos)
498-
<< "tensor A promoted to register but has elements accessed by multiple threads";
529+
530+
expectNoABCPromotion(code);
499531
}
500532

501533
TEST_F(MatMulBias, RegisterPromotionSharedPreference) {
@@ -506,16 +538,93 @@ TEST_F(MatMulBias, RegisterPromotionSharedPreference) {
506538
.usePrivateMemory(true);
507539

508540
auto code = emitCode({{"N", 42}, {"M", 56}, {"K", 37}}, mappingOptions);
509-
auto declPos = code.find("float32 _O_0[1][1]");
510-
auto cDeclPos = code.find("float32 _C_0[1][1]");
511-
auto aDeclPos = code.find("float32 _A_0[1][1]");
512541

542+
auto declPos = code.find("float32 _O_0[1][1]");
513543
EXPECT_TRUE(declPos == std::string::npos)
514544
<< "not expected promotion to register because promoted to shared";
515-
EXPECT_TRUE(cDeclPos == std::string::npos)
516-
<< "tensor C promoted to register but has no reuse";
517-
EXPECT_TRUE(aDeclPos == std::string::npos)
518-
<< "tensor A promoted to register but has elements accessed by multiple threads";
545+
546+
expectNoABCPromotion(code);
547+
}
548+
549+
TEST_F(MatMulBias, RegistersAtRoot) {
550+
// Disable automatic promotion to registers because we are going to call it
551+
// manually. Require sufficient unrolling to actually hit registers.
552+
auto mappingOptions = CudaMappingOptions::makeNaiveMappingOptions()
553+
.unroll(512)
554+
.useSharedMemory(false)
555+
.usePrivateMemory(false);
556+
557+
auto mscop = prepare({{"N", 42}, {"M", 56}, {"K", 37}}, mappingOptions);
558+
promoteToRegistersBelow(*mscop, mscop->scop().scheduleRoot());
559+
auto code = emitCode(mscop);
560+
561+
// Expecting 4 elements because we map the loop i in O[i][j] to 8 threads
562+
// after tiling by 32.
563+
auto oDeclPos = code.find("float32 _O_0[4][1];");
564+
EXPECT_TRUE(oDeclPos != std::string::npos)
565+
<< "expected O to be promoted to registers";
566+
567+
expectNoABCPromotion(code);
568+
expectNoSymbolicSubscript(code);
569+
570+
auto o00Pos = code.find("_O_0[0][0]");
571+
auto o10Pos = code.find("_O_0[1][0]");
572+
auto o20Pos = code.find("_O_0[2][0]");
573+
auto o30Pos = code.find("_O_0[3][0]");
574+
575+
EXPECT_TRUE(o00Pos != std::string::npos)
576+
<< "expected constant subscripts in _O_0";
577+
EXPECT_TRUE(o10Pos != std::string::npos)
578+
<< "expected constant subscripts in _O_0";
579+
EXPECT_TRUE(o20Pos != std::string::npos)
580+
<< "expected constant subscripts in _O_0";
581+
EXPECT_TRUE(o30Pos != std::string::npos)
582+
<< "expected constant subscripts in _O_0";
583+
}
584+
585+
TEST_F(MatMulBias, RegistersAtRootNotEnoughUnroll) {
586+
// Disable automatic promotion to registers because we are going to call it
587+
// manually. Require no unrolling so as to make promotion to registers
588+
// invalid.
589+
auto mappingOptions = CudaMappingOptions::makeNaiveMappingOptions()
590+
.unroll(1)
591+
.useSharedMemory(false)
592+
.usePrivateMemory(false);
593+
594+
auto mscop = prepare({{"N", 42}, {"M", 56}, {"K", 37}}, mappingOptions);
595+
promoteToRegistersBelow(*mscop, mscop->scop().scheduleRoot());
596+
auto code = emitCode(mscop);
597+
auto oDeclPos = code.find("float32 _O_0;");
598+
599+
EXPECT_TRUE(oDeclPos == std::string::npos)
600+
<< "not expected O to be promoted to registers";
601+
602+
expectNoABCPromotion(code);
603+
expectNoSymbolicSubscript(code);
604+
}
605+
606+
TEST_F(MatMulBias, RegistersBelowFirstBand) {
607+
using namespace polyhedral::detail;
608+
609+
// Disable automatic promotion to registers because we are going to call it
610+
// manually.
611+
auto mappingOptions = CudaMappingOptions::makeNaiveMappingOptions()
612+
.useSharedMemory(false)
613+
.usePrivateMemory(false);
614+
auto mscop = prepare({{"N", 42}, {"M", 56}, {"K", 37}}, mappingOptions);
615+
616+
auto nodes = ScheduleTree::collectDFSPostorder(
617+
mscop->scop().scheduleRoot(), ScheduleTreeType::Band);
618+
ASSERT_GT(nodes.size(), 0u);
619+
auto node = nodes[0];
620+
promoteToRegistersBelow(*mscop, node);
621+
auto code = emitCode(mscop);
622+
623+
auto oDeclPos = code.find("float32 _O_0[1][1];");
624+
EXPECT_TRUE(oDeclPos != std::string::npos)
625+
<< "expected O to be promoted to registers";
626+
expectNoABCPromotion(code);
627+
expectNoSymbolicSubscript(code);
519628
}
520629

521630
class Strided : public TestMapper {

0 commit comments

Comments
 (0)