Skip to content
This repository was archived by the owner on Apr 28, 2023. It is now read-only.

Commit 9155a07

Browse files
committed
promoteToRegistersBelow: limit the number of registers to use
Introduce a per-thread limit on the total number of registers to use during promotion. This limit does not differentiate between data types because we cannot control register allocation at the CUDA level anyway. It rather serves as a controllable input to the promotion heuristic.
1 parent 1e5ad91 commit 9155a07

File tree

2 files changed

+23
-3
lines changed

2 files changed

+23
-3
lines changed

tc/core/polyhedral/cuda/memory_promotion_heuristic.cc

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -630,8 +630,17 @@ void promoteToSharedAtDepth(
630630
* of "mscop". Throw if promotion would violate the well-formedness of the
631631
* schedule tree, in particular in cases of promotion immediately below
632632
* a set/sequence node or immediately above a thread-specific marker node.
633+
* Promote at most "maxElements" elements per thread and return the difference
634+
* between "maxElements" and the number of actually promoted elements. Note
635+
* that this function does not differentiate types and sizes of the promoted
636+
* elements because register allocation cannot be controlled at the CUDA level
637+
* anyway. Instead, the "maxElements" value controls how much register
638+
* promotion is performed overall.
633639
*/
634-
void promoteToRegistersBelow(MappedScop& mscop, detail::ScheduleTree* scope) {
640+
size_t promoteToRegistersBelow(
641+
MappedScop& mscop,
642+
detail::ScheduleTree* scope,
643+
size_t maxElements) {
635644
// Cannot promote below a sequence or a set node. Promotion may insert an
636645
// extension node, but sequence/set must be followed by filters.
637646
if (scope->as<detail::ScheduleTreeSequence>() ||
@@ -684,6 +693,12 @@ void promoteToRegistersBelow(MappedScop& mscop, detail::ScheduleTree* scope) {
684693
if (sizes.size() == 0) {
685694
continue;
686695
}
696+
// Do not promote if it would require more registers than remain available.
697+
auto nElements = std::accumulate(
698+
sizes.begin(), sizes.end(), 1u, std::multiplies<size_t>());
699+
if (nElements > maxElements) {
700+
continue;
701+
}
687702
if (!isPromotableToRegistersBelow(
688703
*group, root, scope, partialSchedMupa, threadSchedule)) {
689704
continue;
@@ -703,13 +718,14 @@ void promoteToRegistersBelow(MappedScop& mscop, detail::ScheduleTree* scope) {
703718
std::move(group),
704719
scope,
705720
partialSched);
721+
maxElements -= nElements;
706722
}
707723
}
708724

709725
// Return immediately if nothing was promoted.
710726
if (scope->numChildren() == 0 ||
711727
!matchOne(extension(sequence(any())), scope->child({0}))) {
712-
return;
728+
return maxElements;
713729
}
714730

715731
// If promoting above thread mapping, insert synchronizations.
@@ -725,6 +741,7 @@ void promoteToRegistersBelow(MappedScop& mscop, detail::ScheduleTree* scope) {
725741
if (functional::Filter(isMappingTo<mapping::ThreadId>, ancestors).empty()) {
726742
scop.insertSyncsAroundSeqChildren(scope->child({0, 0}));
727743
}
744+
return maxElements;
728745
}
729746

730747
/*

tc/core/polyhedral/cuda/memory_promotion_heuristic.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,10 @@ void promoteToSharedAtDepth(
4141
std::size_t sharedMemorySize,
4242
bool unrollCopies);
4343

44-
void promoteToRegistersBelow(MappedScop& mscop, detail::ScheduleTree* scope);
44+
size_t promoteToRegistersBelow(
45+
MappedScop& mscop,
46+
detail::ScheduleTree* scope,
47+
std::size_t maxElements = SIZE_MAX);
4548

4649
void promoteToRegistersAtDepth(MappedScop& scop, std::size_t depth);
4750

0 commit comments

Comments
 (0)