
Commit 184524f

TensorReferenceGroup: replace accessedBySubtree with accessedWithin
This allows block/thread mappings to be handled in a more transparent way. In particular, computed groups can be made specific to blocks and/or threads, even if they are scoped above the block or thread mapping, by intersecting the domain of the schedule with the mapping filters. Until now, this was done implicitly by the partial schedule computation functions, which intersect the domain of the schedule with the filters located above it in the tree; as a result, promotion functions had to be called below the block mapping for shared memory and below the thread mapping for registers.
1 parent: b9550b7

File tree: 5 files changed (+62, -20 lines)

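The following is a minimal sketch of the new call pattern, condensed from the promoteToSharedGreedy hunk below; the variables root, bandNode, and scop come from that function's surrounding context and are not declared here.

// Old entry point (removed by this commit): the partial-schedule helpers it
// relied on intersected the schedule domain with ancestor filters implicitly,
// so groups were per-block/per-thread only when the call site itself sat
// below the corresponding mapping filter.
//   auto groupMap = TensorReferenceGroup::accessedBySubtree(bandNode, scop);

// New entry point: the caller builds the outer schedule and restricts its
// domain explicitly. Intersecting with the active domain points at bandNode
// (already restricted by any block/thread mapping filters above it) makes
// the groups block- and/or thread-specific even when bandNode is scoped
// above the mapping.
auto activePoints = activeDomainPoints(root, bandNode);
auto partialSched = partialSchedule(root, bandNode);
auto groupMap = TensorReferenceGroup::accessedWithin(
    partialSched.intersect_domain(activePoints), scop.reads, scop.writes);
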
tc/core/polyhedral/cuda/memory_promotion_heuristic.cc

Lines changed: 10 additions & 3 deletions
@@ -468,11 +468,13 @@ void promoteToSharedGreedy(
   // both.
   size_t remainingMemory = maxMemory;
   for (auto bandNode : bands) {
-    auto groupMap = TensorReferenceGroup::accessedBySubtree(bandNode, scop);
+    auto activePoints = activeDomainPoints(root, bandNode);
     auto partialSched = partialSchedule(root, bandNode);
+
+    auto groupMap = TensorReferenceGroup::accessedWithin(
+        partialSched.intersect_domain(activePoints), scop.reads, scop.writes);
     // Pure affine schedule without (mapping) filters.
     auto partialSchedMupa = partialScheduleMupa(root, bandNode);
-    auto activePoints = activeDomainPoints(root, bandNode);

     // Prepare groups for sorting, to have specified order necessary for
     // reproducibility and tests.
@@ -581,7 +583,12 @@ void promoteToRegistersBelowThreads(Scop& scop, size_t nRegisters) {
     auto mapSchedMupa = infixScheduleMupa(root, mapping, marker);
     auto partialSchedMupa = prefixSchedMupa.flat_range_product(mapSchedMupa);

-    auto groupMap = TensorReferenceGroup::accessedBySubtree(marker, scop);
+    // Because this function is called below the thread mapping marker,
+    // partialSched has been intersected with both the block and the thread
+    // mapping filters. Therefore, groups will be computed relative to
+    // blocks and threads.
+    auto groupMap = TensorReferenceGroup::accessedWithin(
+        partialSched, scop.reads, scop.writes);
     for (auto& tensorGroups : groupMap) {
       auto tensorId = tensorGroups.first;

tc/core/polyhedral/memory_promotion.cc

Lines changed: 28 additions & 7 deletions
@@ -326,17 +326,38 @@ void addSingletonReferenceGroups(
 }
 } // namespace

-TensorGroups TensorReferenceGroup::accessedBySubtree(
-    const ScheduleTree* tree,
-    const Scop& scop) {
+// Compute tensor reference groups encapsulating all tensor accesses within
+// "outerSchedule". Only statement instances present in the domain of
+// "outerSchedule" are considered. In particular, if this domain is
+// intersected with block and/or thread mapping, the reference groups are
+// computed inside one block and/or thread, even if "outerSchedule" does not
+// include band members mapped to blocks and/or threads.
+//
+// Tensor reference descriptors (TensorReference) contain information about
+// tensor elements accessed through the given reference within "outerSchedule".
+// Several references form a group (TensorReferenceGroup) if the same elements
+// may be accessed through these references, and at least one of the accesses
+// writes to the element. A group stores a rectangular overapproximation of
+// the set of accessed tensor elements (access footprint). This
+// overapproximation can be used to create copies of the given tensor elements
+// in another memory space, i.e., to perform memory promotion. If the domain
+// of "outerSchedule" included thread or block mapping, then the
+// overapproximation is computed per-block or per-thread.
+//
+// Returns a map between tensor ids and vectors of unique pointers to
+// TensorReferenceGroup, with each group potentially containing multiple
+// references.
+TensorGroups TensorReferenceGroup::accessedWithin(
+    isl::union_map outerSchedule,
+    isl::union_map reads,
+    isl::union_map writes) {
   TensorGroups tensorGroups;
-  auto domain = activeDomainPoints(scop.scheduleRoot(), tree);
-  auto schedule = partialSchedule(scop.scheduleRoot(), tree);
+  auto domain = outerSchedule.domain();

   addSingletonReferenceGroups(
-      tensorGroups, scop.writes, domain, schedule, AccessType::Write);
+      tensorGroups, writes, domain, outerSchedule, AccessType::Write);
   addSingletonReferenceGroups(
-      tensorGroups, scop.reads, domain, schedule, AccessType::Read);
+      tensorGroups, reads, domain, outerSchedule, AccessType::Read);

   // For each tensor, join groups whose footprints overlap and at least one
   // access is a write. Do not join between tensors because no aliasing.

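For orientation, the TensorGroups value returned by accessedWithin is, per the comment above, a map from tensor ids to vectors of unique pointers to TensorReferenceGroup. Below is a minimal, hypothetical consumption sketch assuming an outerSchedule union map built by the caller; the iteration pattern mirrors promoteToRegistersBelowThreads and Scop::promoteEverythingAt in this commit.

auto groupMap = TensorReferenceGroup::accessedWithin(
    outerSchedule, scop.reads, scop.writes);
for (auto& tensorGroups : groupMap) {
  auto tensorId = tensorGroups.first;  // identifier of the accessed tensor
  for (auto& group : tensorGroups.second) {
    // Each element is a unique_ptr<TensorReferenceGroup> holding one or more
    // references; references were joined into the same group when their
    // footprints overlap and at least one of the accesses is a write.
    // The group's rectangular footprint overapproximation is the unit at
    // which promotion (e.g., promoteGroup) operates.
  }
}
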
tc/core/polyhedral/memory_promotion.h

Lines changed: 4 additions & 3 deletions
@@ -111,9 +111,10 @@ class TensorReferenceGroup {
   TensorReferenceGroup() {}

  public:
-  static TensorGroups accessedBySubtree(
-      const detail::ScheduleTree* tree,
-      const Scop& scop);
+  static TensorGroups accessedWithin(
+      isl::union_map outerSchedule,
+      isl::union_map reads,
+      isl::union_map writes);

   bool isReadOnly() const;


tc/core/polyhedral/scop.cc

Lines changed: 1 addition & 1 deletion
@@ -249,7 +249,7 @@ void Scop::promoteEverythingAt(std::vector<size_t> pos) {
   checkFiltersDisjointStatements(scheduleRoot());
   auto schedule = partialSchedule(root, tree);

-  auto groupMap = TensorReferenceGroup::accessedBySubtree(tree, *this);
+  auto groupMap = TensorReferenceGroup::accessedWithin(schedule, reads, writes);
   for (auto& p : groupMap) {
     for (auto& gr : p.second) {
       promoteGroup(

test/test_cuda_mapper_memory_promotion.cc

Lines changed: 19 additions & 6 deletions
@@ -51,6 +51,21 @@ class TestMapper : public ::testing::Test {
     return MappedScop::makeWithOuterBlockInnerThreadStrategy(
         std::move(scop), mappingOptions);
   }
+
+  // This mimics behavior of the old TensorReferenceGroup::accessedBySubtree.
+  // In particular, it takes the partial schedule until "tree" inclusive,
+  // intersecting its domain with all (mapping) filter ancestors and computes
+  // accessed tensor reference groups within that schedule.
+  // If the schedule happens to contain the block/thread mapping filter, the
+  // groups are per-block/thread. Otherwise, they include all blocks/threads,
+  // which is questionable, but corresponds to what the test currently checks.
+  TensorGroups accessedBySubtree(
+      const polyhedral::detail::ScheduleTree* tree,
+      const Scop& scop) {
+    auto schedule = partialSchedule(scop.scheduleRoot(), tree);
+    return TensorReferenceGroup::accessedWithin(
+        schedule, scop.reads, scop.writes);
+  }
 };

 class MapperMemoryPromotion2DHelper : public TestMapper {
@@ -253,8 +268,7 @@ def fun(float(N, M) A, float(N, M) B) -> (C) {
   // Must force domain intersection for overapproximation to work
   scop.specializeToContext();
   auto ctx = scop.domain().get_ctx();
-  auto groups = TensorReferenceGroup::accessedBySubtree(
-      scop.scheduleRoot()->child(childPos), scop);
+  auto groups = accessedBySubtree(scop.scheduleRoot()->child(childPos), scop);
   LOG(INFO) << "Groups:\n" << groups;

   EXPECT_EQ(groups.size(), 3u);
@@ -333,8 +347,7 @@ def fun(float(N, M) A) -> (B, C) {
   // Must force domain intersection for overapproximation to work
   scop.specializeToContext();
   auto ctx = scop.domain().get_ctx();
-  auto groups = TensorReferenceGroup::accessedBySubtree(
-      scop.scheduleRoot()->child(childPos), scop);
+  auto groups = accessedBySubtree(scop.scheduleRoot()->child(childPos), scop);
   LOG(INFO) << "Groups:\n" << groups;

   ASSERT_EQ(groups.size(), 3u);
@@ -521,8 +534,8 @@ class Strided : public TestMapper {
   auto& scop = mscop->scop();
   auto ctx = scop.domain().get_ctx();

-  auto groups = TensorReferenceGroup::accessedBySubtree(
-      scop.scheduleRoot()->child({0, 0, 0}), scop);
+  auto groups =
+      accessedBySubtree(scop.scheduleRoot()->child({0, 0, 0}), scop);
   EXPECT_EQ(groups.size(), 2u) << "expected groups for both tensors";

   for (const auto& g : groups) {
