Commit 5b448f0

merge promoteToSharedGreedy and promoteGreedilyAtDepth into one function
Arguably, it was a mistake to have separate functions in the first place. This led to situations in tests where the copies between global and shared memory were not mapped to threads. Merge promoteToSharedGreedy and promoteGreedilyAtDepth into a single function, promoteToSharedAtDepth. The name is chosen for consistency with promoteToRegistersAtDepth.
1 parent fe8f519 commit 5b448f0
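For orientation, a condensed sketch of the merged entry point, abridged from the memory_promotion_heuristic.cc diff below (the band-collection and promotion steps are elided; this is not the full implementation):

// Condensed sketch; see the full diff below.
void promoteToSharedAtDepth(
    MappedScop& mscop,
    size_t depth,
    size_t maxMemory,
    bool unrollCopies) {
  auto& scop = mscop.scop();
  // ... collect bands at "depth" and greedily promote tensor reference
  // groups to shared memory until "maxMemory" is exhausted ...
  // Map the global<->shared copies to threads. With the old two-function
  // split, callers of promoteToSharedGreedy could miss this step.
  mapCopiesToThreads(mscop, unrollCopies);
}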

File tree

4 files changed: +23 −27 lines

  tc/core/polyhedral/cuda/mapped_scop.cc
  tc/core/polyhedral/cuda/memory_promotion_heuristic.cc
  tc/core/polyhedral/cuda/memory_promotion_heuristic.h
  test/test_cuda_mapper_memory_promotion.cc

tc/core/polyhedral/cuda/mapped_scop.cc

Lines changed: 1 addition & 1 deletion
@@ -1079,7 +1079,7 @@ std::unique_ptr<MappedScop> MappedScop::makeWithOuterBlockInnerThreadStrategy(
       !generic.proto.has_unroll())
       << "requested to unroll copies to shared memory without providing the unroll size";
 
-  promoteGreedilyAtDepth(
+  promoteToSharedAtDepth(
       *mappedScop,
       std::min(band->nOuterCoincident(), mappedScop->numBlocks.view.size()),
       sharedMemorySize,

tc/core/polyhedral/cuda/memory_promotion_heuristic.cc

Lines changed: 20 additions & 21 deletions
@@ -540,6 +540,16 @@ void promoteToSharedBelow(
   scop.insertSyncsAroundCopies(node);
 }
 
+/*
+ * Check if "tree" is a band node mapped to threads. In particular, check that
+ * "tree" is a band and a thread-specific node appears as its only child.
+ */
+inline bool isThreadMappedBand(const detail::ScheduleTree* tree) {
+  return matchOne(band(threadSpecific(any())), tree) ||
+      matchOne(band(threadSpecific()), tree);
+}
+} // namespace
+
 /*
  * For every place in the schedule tree where schedule depth (i.e., the number
  * of preceding band members) is "depth", promote tensor reference groups to
@@ -550,14 +560,22 @@ void promoteToSharedBelow(
  *
  * Only promote if the tensor elements referenced by the group are reused or
  * accessed in a non-coalesced way.
+ *
+ * If "unrollCopies" is set, use the unroll factor from "mscop" to unroll the
+ * loops that copy values from global to shared memory and back.
  */
-void promoteToSharedGreedy(Scop& scop, size_t depth, size_t maxMemory) {
+void promoteToSharedAtDepth(
+    MappedScop& mscop,
+    size_t depth,
+    size_t maxMemory,
+    bool unrollCopies) {
   using namespace tc::polyhedral::detail;
 
   if (depth == 0) {
     throw promotion::PromotionNYI("promotion before any band");
   }
 
+  auto& scop = mscop.scop();
   auto root = scop.scheduleRoot();
 
   // 1. Collect all bands with a member located at the given depth in the
@@ -575,27 +593,8 @@ void promoteToSharedGreedy(Scop& scop, size_t depth, size_t maxMemory) {
   for (auto bandNode : bands) {
     promoteToSharedBelow(scop, bandNode, remainingMemory);
   }
-}
-
-/*
- * Check if "tree" is a band node mapped to threads. In particular, check that
- * "tree" is a band and a thread-specific node appears as its only child.
- */
-inline bool isThreadMappedBand(const detail::ScheduleTree* tree) {
-  return matchOne(band(threadSpecific(any())), tree) ||
-      matchOne(band(threadSpecific()), tree);
-}
-} // namespace
-
-void promoteGreedilyAtDepth(
-    MappedScop& mscop,
-    size_t depth,
-    size_t sharedMemorySize,
-    bool unrollCopies) {
-  // 1. Promote using heuristic.
-  promoteToSharedGreedy(mscop.scop(), depth, sharedMemorySize);
 
-  // 2. Map copies to shared, state by copy
+  // 3. Map copies to shared.
   mapCopiesToThreads(mscop, unrollCopies);
 }
 

tc/core/polyhedral/cuda/memory_promotion_heuristic.h

Lines changed: 1 addition & 4 deletions
@@ -33,10 +33,7 @@ class ScheduleTree;
 // promote to shared memory at "depth" until "sharedMemorySize" is used.
 // Map copies between global and shared memory to threads and unroll those
 // copies if "unrollCopies" is set, using the options in "mscop".
-// "threadIdxXScheduleDepthState" contains the schedule depth at which the
-// computation was mapped to thread x and is used to check whether the global
-// memory is accessed in a coalesced way.
-void promoteGreedilyAtDepth(
+void promoteToSharedAtDepth(
     MappedScop& scop,
     std::size_t depth,
     std::size_t sharedMemorySize,

test/test_cuda_mapper_memory_promotion.cc

Lines changed: 1 addition & 1 deletion
@@ -392,7 +392,7 @@ def fun(float(N, M) A) -> (B, C) {
       size_t maxSharedMemory) {
     auto mscop = prepareScop(
         tc, {{"N", problemSize1}, {"M", problemSize2}}, {tileSize1, tileSize2});
-    promoteGreedilyAtDepth(*mscop, depth, maxSharedMemory, false);
+    promoteToSharedAtDepth(*mscop, depth, maxSharedMemory, false);
     return mscop;
   }
 };
