
Commit 184524f

TensorReferenceGroup: replace accessedBySubtree with accessedWithin
This allows block/thread mappings to be handled in a more transparent way. In particular, computed groups can be made specific to blocks and/or threads, even if they are scoped above the block or thread mapping, by intersecting the domain of the schedule with the mapping filters. Until now, this was done implicitly by the partial schedule computation functions, which intersect the domain of the schedule with the filters located above it in the tree; as a result, promotion functions had to be called below the block mapping for shared memory and below the thread mapping for registers.
1 parent: b9550b7

File tree: 5 files changed (+62, -20 lines)

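The following is a minimal sketch of the new call pattern, condensed from the promoteToSharedGreedy hunk below; the variables root, bandNode, and scop come from that function's surrounding context and are not declared here.

// Old entry point (removed by this commit): the partial-schedule helpers it
// relied on intersected the schedule domain with ancestor filters implicitly,
// so groups were per-block/per-thread only when the call site itself sat
// below the corresponding mapping filter.
//   auto groupMap = TensorReferenceGroup::accessedBySubtree(bandNode, scop);

// New entry point: the caller builds the outer schedule and restricts its
// domain explicitly. Intersecting with the active domain points at bandNode
// (already restricted by any block/thread mapping filters above it) makes
// the groups block- and/or thread-specific even when bandNode is scoped
// above the mapping.
auto activePoints = activeDomainPoints(root, bandNode);
auto partialSched = partialSchedule(root, bandNode);
auto groupMap = TensorReferenceGroup::accessedWithin(
    partialSched.intersect_domain(activePoints), scop.reads, scop.writes);
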
tc/core/polyhedral/cuda/memory_promotion_heuristic.cc

Lines changed: 10 additions & 3 deletions
@@ -468,11 +468,13 @@ void promoteToSharedGreedy(
   // both.
   size_t remainingMemory = maxMemory;
   for (auto bandNode : bands) {
-    auto groupMap = TensorReferenceGroup::accessedBySubtree(bandNode, scop);
+    auto activePoints = activeDomainPoints(root, bandNode);
     auto partialSched = partialSchedule(root, bandNode);
+
+    auto groupMap = TensorReferenceGroup::accessedWithin(
+        partialSched.intersect_domain(activePoints), scop.reads, scop.writes);
     // Pure affine schedule without (mapping) filters.
     auto partialSchedMupa = partialScheduleMupa(root, bandNode);
-    auto activePoints = activeDomainPoints(root, bandNode);

     // Prepare groups for sorting, to have specified order necessary for
     // reproducibility and tests.
@@ -581,7 +583,12 @@ void promoteToRegistersBelowThreads(Scop& scop, size_t nRegisters) {
     auto mapSchedMupa = infixScheduleMupa(root, mapping, marker);
     auto partialSchedMupa = prefixSchedMupa.flat_range_product(mapSchedMupa);

-    auto groupMap = TensorReferenceGroup::accessedBySubtree(marker, scop);
+    // Because this function is called below the thread mapping marker,
+    // partialSched has been intersected with both the block and the thread
+    // mapping filters. Therefore, groups will be computed relative to
+    // blocks and threads.
+    auto groupMap = TensorReferenceGroup::accessedWithin(
+        partialSched, scop.reads, scop.writes);
     for (auto& tensorGroups : groupMap) {
       auto tensorId = tensorGroups.first;

tc/core/polyhedral/memory_promotion.cc

Lines changed: 28 additions & 7 deletions
@@ -326,17 +326,38 @@ void addSingletonReferenceGroups(
 }
 } // namespace

-TensorGroups TensorReferenceGroup::accessedBySubtree(
-    const ScheduleTree* tree,
-    const Scop& scop) {
+// Compute tensor reference groups encapsulating all tensor accesses within
+// "outerSchedule". Only statement instances present in the domain of
+// "outerSchedule" are considered. In particular, if this domain is
+// intersected with block and/or thread mapping, the reference groups are
+// computed inside one block and/or thread, even if "outerSchedule" does not
+// include band members mapped to blocks and/or threads.
+//
+// Tensor reference descriptors (TensorReference) contain information about
+// tensor elements accessed through the given reference within "outerSchedule".
+// Several references form a group (TensorReferenceGroup) if the same elements
+// may be accessed through these references, and at least one of the accesses
+// writes to the element. A group stores a rectangular overapproximation of
+// the set of accessed tensor elements (access footprint). This
+// overapproximation can be used to create copies of the given tensor elements
+// in another memory space, i.e., to perform memory promotion. If the domain
+// of "outerSchedule" included thread or block mapping, then the
+// overapproximation is computed per-block or per-thread.
+//
+// Returns a map between tensor ids and vectors of unique pointers to
+// TensorReferenceGroup, with each group potentially containing multiple
+// references.
+TensorGroups TensorReferenceGroup::accessedWithin(
+    isl::union_map outerSchedule,
+    isl::union_map reads,
+    isl::union_map writes) {
   TensorGroups tensorGroups;
-  auto domain = activeDomainPoints(scop.scheduleRoot(), tree);
-  auto schedule = partialSchedule(scop.scheduleRoot(), tree);
+  auto domain = outerSchedule.domain();

   addSingletonReferenceGroups(
-      tensorGroups, scop.writes, domain, schedule, AccessType::Write);
+      tensorGroups, writes, domain, outerSchedule, AccessType::Write);
   addSingletonReferenceGroups(
-      tensorGroups, scop.reads, domain, schedule, AccessType::Read);
+      tensorGroups, reads, domain, outerSchedule, AccessType::Read);

   // For each tensor, join groups whose footprints overlap and at least one
   // access is a write. Do not join between tensors because no aliasing.

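For orientation, the TensorGroups value returned by accessedWithin is, per the comment above, a map from tensor ids to vectors of unique pointers to TensorReferenceGroup. Below is a minimal, hypothetical consumption sketch assuming an outerSchedule union map built by the caller; the iteration pattern mirrors promoteToRegistersBelowThreads and Scop::promoteEverythingAt in this commit.

auto groupMap = TensorReferenceGroup::accessedWithin(
    outerSchedule, scop.reads, scop.writes);
for (auto& tensorGroups : groupMap) {
  auto tensorId = tensorGroups.first;  // identifier of the accessed tensor
  for (auto& group : tensorGroups.second) {
    // Each element is a unique_ptr<TensorReferenceGroup> holding one or more
    // references; references were joined into the same group when their
    // footprints overlap and at least one of the accesses is a write.
    // The group's rectangular footprint overapproximation is the unit at
    // which promotion (e.g., promoteGroup) operates.
  }
}
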
tc/core/polyhedral/memory_promotion.h

Lines changed: 4 additions & 3 deletions
@@ -111,9 +111,10 @@ class TensorReferenceGroup {
   TensorReferenceGroup() {}

  public:
-  static TensorGroups accessedBySubtree(
-      const detail::ScheduleTree* tree,
-      const Scop& scop);
+  static TensorGroups accessedWithin(
+      isl::union_map outerSchedule,
+      isl::union_map reads,
+      isl::union_map writes);

   bool isReadOnly() const;


tc/core/polyhedral/scop.cc

Lines changed: 1 addition & 1 deletion
@@ -249,7 +249,7 @@ void Scop::promoteEverythingAt(std::vector<size_t> pos) {
   checkFiltersDisjointStatements(scheduleRoot());
   auto schedule = partialSchedule(root, tree);

-  auto groupMap = TensorReferenceGroup::accessedBySubtree(tree, *this);
+  auto groupMap = TensorReferenceGroup::accessedWithin(schedule, reads, writes);
   for (auto& p : groupMap) {
     for (auto& gr : p.second) {
       promoteGroup(

test/test_cuda_mapper_memory_promotion.cc

Lines changed: 19 additions & 6 deletions
@@ -51,6 +51,21 @@ class TestMapper : public ::testing::Test {
     return MappedScop::makeWithOuterBlockInnerThreadStrategy(
         std::move(scop), mappingOptions);
   }
+
+  // This mimics behavior of the old TensorReferenceGroup::accessedBySubtree.
+  // In particular, it takes the partial schedule until "tree" inclusive,
+  // intersecting its domain with all (mapping) filter ancestors and computes
+  // accessed tensor reference groups within that schedule.
+  // If the schedule happens to contain the block/thread mapping filter, the
+  // groups are per-block/thread. Otherwise, they include all blocks/threads,
+  // which is questionable, but corresponds to what the test currently checks.
+  TensorGroups accessedBySubtree(
+      const polyhedral::detail::ScheduleTree* tree,
+      const Scop& scop) {
+    auto schedule = partialSchedule(scop.scheduleRoot(), tree);
+    return TensorReferenceGroup::accessedWithin(
+        schedule, scop.reads, scop.writes);
+  }
 };

 class MapperMemoryPromotion2DHelper : public TestMapper {
@@ -253,8 +268,7 @@ def fun(float(N, M) A, float(N, M) B) -> (C) {
   // Must force domain intersection for overapproximation to work
   scop.specializeToContext();
   auto ctx = scop.domain().get_ctx();
-  auto groups = TensorReferenceGroup::accessedBySubtree(
-      scop.scheduleRoot()->child(childPos), scop);
+  auto groups = accessedBySubtree(scop.scheduleRoot()->child(childPos), scop);
   LOG(INFO) << "Groups:\n" << groups;

   EXPECT_EQ(groups.size(), 3u);
@@ -333,8 +347,7 @@ def fun(float(N, M) A) -> (B, C) {
   // Must force domain intersection for overapproximation to work
   scop.specializeToContext();
   auto ctx = scop.domain().get_ctx();
-  auto groups = TensorReferenceGroup::accessedBySubtree(
-      scop.scheduleRoot()->child(childPos), scop);
+  auto groups = accessedBySubtree(scop.scheduleRoot()->child(childPos), scop);
   LOG(INFO) << "Groups:\n" << groups;

   ASSERT_EQ(groups.size(), 3u);
@@ -521,8 +534,8 @@ class Strided : public TestMapper {
   auto& scop = mscop->scop();
   auto ctx = scop.domain().get_ctx();

-  auto groups = TensorReferenceGroup::accessedBySubtree(
-      scop.scheduleRoot()->child({0, 0, 0}), scop);
+  auto groups =
+      accessedBySubtree(scop.scheduleRoot()->child({0, 0, 0}), scop);
   EXPECT_EQ(groups.size(), 2u) << "expected groups for both tensors";

   for (const auto& g : groups) {
