promoteToRegistersBelowThreads: use markers to find location below threads

Sven Verdoolaege · Sven Verdoolaege · commit e11308da5cf2 · 2018-04-26T15:13:23.000+02:00
This greatly simplifies the code and is a step towards
the removal of ThreadIdxXScheduleDepthState.

Note that mapCopiesToThreads inserts such a marker,
but does not set ThreadIdxXScheduleDepthState.
As a result, register promotion will now also be attempted
on the shared memory copying code, but it will not
actually promote anything there because it won't find any reuse.
If these checks for reuse turn out to be too expensive,
an explicit check for this situation can be added
to the register promotion later.
diff --git a/tc/core/polyhedral/cuda/mapped_scop.cc b/tc/core/polyhedral/cuda/mapped_scop.cc
@@ -705,8 +705,7 @@ std::unique_ptr<MappedScop> MappedScop::makeWithOuterBlockInnerThreadStrategy(
 
   // 8. Promote to registers below the loops mapped to threads.
   if (cudaOptions.proto().use_private_memory()) {
-    promoteToRegistersBelowThreads(
-        mappedScop->scop(), mappedScop->threadIdxXScheduleDepthState, -1ull);
+    promoteToRegistersBelowThreads(mappedScop->scop(), -1ull);
   }
 
   // 9. Insert mapping context
diff --git a/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc b/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc
@@ -27,6 +27,7 @@
 #include <algorithm>
 #include <numeric>
 #include <sstream>
+#include <type_traits>
 
 namespace tc {
 namespace polyhedral {
@@ -128,6 +129,21 @@ void mapCopiesToThreads(MappedScop& mscop, bool unroll) {
   }
 }
 
+/*
+ * Starting from "root", collect all thread specific markers.  They are
+ * returned in DFS preorder, so the order is specified and stable for tests.
+ */
+template <typename T>
+std::vector<T> findThreadSpecificMarkers(T root) {
+  using namespace tc::polyhedral::detail;
+  static_assert(
+      std::is_convertible<T, const ScheduleTree*>::value,
+      "expecting ScheduleTree");
+
+  return ScheduleTree::collectDFSPreorder(
+      root, ScheduleTreeType::ThreadSpecificMarker);
+}
+
 /*
  * Transform schedule bands into a union_map.
  * Takes all partial schedules at leaves as MUPAs (without accounting for
@@ -555,51 +571,28 @@ void promoteGreedilyAtDepth(
   mapCopiesToThreads(mscop, unrollCopies);
 }
 
-// Assuming the mapping to threads happens in inverse order, i.e. the innermost
-// loop is mapped to thread x, promote below that depth.
-void promoteToRegistersBelowThreads(
-    Scop& scop,
-    const ThreadIdxXScheduleDepthState& threadIdxXScheduleDepthState,
-    size_t nRegisters) {
+// Promote at the positions of the thread specific markers.
+void promoteToRegistersBelowThreads(Scop& scop, size_t nRegisters) {
   using namespace tc::polyhedral::detail;
 
   auto root = scop.scheduleRoot();
 
   auto fullSched = fullSchedule(root);
-  for (const auto& kvp : threadIdxXScheduleDepthState) {
-    auto depth = kvp.second + 1;
-    auto subdomain = kvp.first;
-
-    // Collect all bands where a member is located at the given depth.
-    auto bands = bandsContainingScheduleDepth(root, depth);
-    // We may have no band members mapped to thread x in case when we
-    // force-mapped everything to one thread.
-    if (bands.size() == 0) {
-      continue;
-    }
-
-    // Keep only those bands for which this depth was recorded.
-    std::function<bool(ScheduleTree*)> keepActive =
-        [root, subdomain](const ScheduleTree* tree) {
-          isl::union_set active = activeDomainPoints(root, tree);
-          return !active.intersect(subdomain).is_empty();
-        };
-    bands = functional::Filter(keepActive, bands);
-
-    // Make sure the band ends at thread x depth so we can promote below it.
-    bands = bandsSplitAfterDepth(bands, root, depth);
+  {
+    auto markers = findThreadSpecificMarkers(root);
 
-    for (auto band : bands) {
+    for (auto marker : markers) {
       // Find out how many threads are actually mapped.  Active domain points
       // will involve all mapping parameters when we take them below the
       // mapping.  Skip mapping parameters obviously mapped to 0, because they
       // do not correspond to band members that should be fixed to obtain
       // per-thread-group access relations.
-      auto points = activeDomainPoints(root, band);
-      auto partialSched = partialSchedule(root, band);
+      auto points = activeDomainPoints(root, marker);
+      auto partialSched = prefixSchedule(root, marker);
       // Pure affine schedule without (mapping) filters.
-      auto partialSchedMupa = partialScheduleMupa(root, band);
+      auto partialSchedMupa = prefixScheduleMupa(root, marker);
 
+      auto depth = marker->scheduleDepth(root);
       size_t nMappedThreads = 0;
       for (unsigned j = 0; j < points.dim(isl::dim_type::param); ++j) {
         auto id = points.get_space().get_dim_id(isl::dim_type::param, j);
@@ -616,7 +609,7 @@ void promoteToRegistersBelowThreads(
         }
       }
 
-      auto groupMap = TensorReferenceGroup::accessedBySubtree(band, scop);
+      auto groupMap = TensorReferenceGroup::accessedBySubtree(marker, scop);
       for (auto& tensorGroups : groupMap) {
         auto tensorId = tensorGroups.first;
 
@@ -642,7 +635,7 @@ void promoteToRegistersBelowThreads(
               Scop::PromotedDecl::Kind::Register,
               tensorId,
               std::move(group),
-              band,
+              marker,
               partialSched);
         }
       }
diff --git a/tc/core/polyhedral/cuda/memory_promotion_heuristic.h b/tc/core/polyhedral/cuda/memory_promotion_heuristic.h
@@ -42,9 +42,6 @@ void promoteGreedilyAtDepth(
     std::size_t sharedMemorySize,
     bool unrollCopies);
 
-void promoteToRegistersBelowThreads(
-    Scop& scop,
-    const ThreadIdxXScheduleDepthState& threadIdxXScheduleDepthState,
-    std::size_t nRegisters);
+void promoteToRegistersBelowThreads(Scop& scop, std::size_t nRegisters);
 } // namespace polyhedral
 } // namespace tc

Original file line number	Diff line number	Diff line change
`@@ -705,8 +705,7 @@ std::unique_ptr<MappedScop> MappedScop::makeWithOuterBlockInnerThreadStrategy(`
`705`	`705`
`706`	`706`	`// 8. Promote to registers below the loops mapped to threads.`
`707`	`707`	`if (cudaOptions.proto().use_private_memory()) {`
`708`		`- promoteToRegistersBelowThreads(`
`709`		`- mappedScop->scop(), mappedScop->threadIdxXScheduleDepthState, -1ull);`
	`708`	`+ promoteToRegistersBelowThreads(mappedScop->scop(), -1ull);`
`710`	`709`	`}`
`711`	`710`
`712`	`711`	`// 9. Insert mapping context`