Skip to content
This repository was archived by the owner on Apr 28, 2023. It is now read-only.

Commit 31fd67b

Browse files
committed
register promotion: promote below any node in the tree
Extend promoteToRegistersBelow to take any schedule tree node as a scoping point and attempt register promotion below that node. Independently of the promotion scope, tensor reference groups must be computed per-thread (and per-block). Include the thread (and block) mapping in the domain of the outer schedule when computing the groups. Since registers are private to a thread, different threads should not access the same value within the given scope. Inside the promotion scope, there may be multiple subtrees with different thread mappings. Furthermore, these mappings are not included in the prefix (scope) schedule. Always include thread schedule when checking if accesses are performed from one thread only. This schedule takes into account potentially different mappings. Compute reuse within threads by appending thread schedule to the prefix schedule before checking for reuse.
1 parent f8b49cc commit 31fd67b

File tree

4 files changed

+183
-35
lines changed

4 files changed

+183
-35
lines changed

tc/core/polyhedral/cuda/memory_promotion_heuristic.cc

Lines changed: 48 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -649,27 +649,55 @@ void promoteGreedilyAtDepth(
649649
mapCopiesToThreads(mscop, unrollCopies);
650650
}
651651

652-
namespace {
653-
654652
/*
655-
* Perform promotion to registers below the thread specific marker "marker"
656-
* in the schedule tree of "mscop".
653+
* Perform promotion to registers below the node "scope" in the schedule tree
654+
* of "mscop". Throw if promotion would violate the well-formedness of the
655+
* schedule tree, in particular in cases of promotion immediately below
656+
* a set/sequence node or immediately above a thread-specific marker node.
657657
*/
658-
void promoteToRegistersBelow(MappedScop& mscop, detail::ScheduleTree* marker) {
658+
void promoteToRegistersBelow(MappedScop& mscop, detail::ScheduleTree* scope) {
659+
// Cannot promote below a sequence or a set node. Promotion may insert an
660+
// extension node, but sequence/set must be followed by filters.
661+
if (scope->elemAs<detail::ScheduleTreeElemSequence>() ||
662+
scope->elemAs<detail::ScheduleTreeElemSet>()) {
663+
throw promotion::IncorrectScope("cannot promote under a sequence/set node");
664+
}
665+
// Cannot promote between a thread-mapped band and a thread-specific marker
666+
// node because the latter is used to identify thread-mapped bands as
667+
// immediate ancestors.
668+
if (scope->numChildren() == 1 &&
669+
scope->child({0})
670+
->elemAs<detail::ScheduleTreeElemThreadSpecificMarker>()) {
671+
throw promotion::IncorrectScope(
672+
"cannot promote above a thread-specific marker node");
673+
}
674+
659675
auto& scop = mscop.scop();
660676
auto root = scop.scheduleRoot();
661-
auto threadMapping = mscop.threadMappingSchedule(root);
662677

663-
auto partialSched = prefixSchedule(root, marker);
678+
// Compute groups specific to threads and blocks by including the mappings
679+
// into the domain of the partial schedule.
680+
auto blockMapping = collectMappingsTo<mapping::BlockId>(scop);
681+
auto mapping =
682+
collectMappingsTo<mapping::ThreadId>(scop).intersect(blockMapping);
683+
auto schedule = partialSchedule(scop.scheduleRoot(), scope);
684+
auto groupMap = TensorReferenceGroup::accessedWithin(
685+
schedule.intersect_domain(mapping), scop.reads, scop.writes);
686+
687+
auto threadSchedule = mscop.threadMappingSchedule(mscop.schedule());
688+
auto blockSchedule = mscop.blockMappingSchedule(mscop.schedule());
689+
664690
// Pure affine schedule without (mapping) filters.
665-
auto partialSchedMupa = partialScheduleMupa(root, marker);
691+
auto partialSchedMupa = partialScheduleMupa(root, scope);
692+
// Schedule with block mapping filter.
693+
auto partialSched =
694+
isl::union_map::from(partialSchedMupa).intersect_domain(blockMapping);
695+
// The following promotion validity and profitability checks need to be
696+
// performed with respect to the block mapping, so append the block schedule.
697+
// If the partial schedule contains it already, it will just end up with
698+
// identical dimensions without affecting the result of the checks.
699+
partialSchedMupa = partialSchedMupa.flat_range_product(blockSchedule);
666700

667-
// Because this function is called below the thread mapping marker,
668-
// partialSched has been intersected with both the block and the thread
669-
// mapping filters. Therefore, groups will be computed relative to
670-
// blocks and threads.
671-
auto groupMap = TensorReferenceGroup::accessedWithin(
672-
partialSched, scop.reads, scop.writes);
673701
for (auto& tensorGroups : groupMap) {
674702
auto tensorId = tensorGroups.first;
675703

@@ -682,27 +710,28 @@ void promoteToRegistersBelow(MappedScop& mscop, detail::ScheduleTree* marker) {
682710
continue;
683711
}
684712
if (!isPromotableToRegistersBelow(
685-
*group, root, marker, partialSchedMupa, threadMapping)) {
713+
*group, root, scope, partialSchedMupa, threadSchedule)) {
686714
continue;
687715
}
688-
if (!hasReuseWithin(*group, partialSchedMupa)) {
716+
// Check reuse within threads.
717+
auto schedule = partialSchedMupa.flat_range_product(threadSchedule);
718+
if (!hasReuseWithin(*group, schedule)) {
689719
continue;
690720
}
721+
691722
// TODO: if something is already in shared, but is reused within one
692723
// thread only, there is no point in keeping it in shared _if_ it
693724
// gets promoted into a register.
694725
scop.promoteGroup(
695726
Scop::PromotedDecl::Kind::Register,
696727
tensorId,
697728
std::move(group),
698-
marker,
729+
scope,
699730
partialSched);
700731
}
701732
}
702733
}
703734

704-
} // namespace
705-
706735
// Promote at the positions of the thread specific markers.
707736
void promoteToRegistersBelowThreads(MappedScop& mscop, size_t nRegisters) {
708737
using namespace tc::polyhedral::detail;

tc/core/polyhedral/cuda/memory_promotion_heuristic.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,10 @@ namespace polyhedral {
2525
class MappedScop;
2626
class Scop;
2727

28+
namespace detail {
29+
class ScheduleTree;
30+
}
31+
2832
// In the given mapped scop "mscop",
2933
// promote to shared memory at "depth" until "sharedMemorySize" is used.
3034
// Map copies between global and shared memory to threads and unroll those
@@ -38,6 +42,8 @@ void promoteGreedilyAtDepth(
3842
std::size_t sharedMemorySize,
3943
bool unrollCopies);
4044

45+
void promoteToRegistersBelow(MappedScop& mscop, detail::ScheduleTree* scope);
46+
4147
void promoteToRegistersBelowThreads(MappedScop& scop, std::size_t nRegisters);
4248
} // namespace polyhedral
4349
} // namespace tc

tc/core/polyhedral/exceptions.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,10 @@ struct PromotionNYI : public std::logic_error {
6262
struct GroupingError : public std::logic_error {
6363
explicit GroupingError(const std::string& s) : std::logic_error(s) {}
6464
};
65+
66+
struct IncorrectScope : public std::logic_error {
67+
explicit IncorrectScope(const std::string& s) : std::logic_error(s) {}
68+
};
6569
} // namespace promotion
6670

6771
namespace codegen {

test/test_cuda_mapper_memory_promotion.cc

Lines changed: 125 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -451,8 +451,8 @@ TEST_F(MapperMemoryPromotionRAW, throwIfCopiesBelowThreads) {
451451
}
452452

453453
class MatMulBias : public TestMapper {
454-
public:
455-
std::string emitCode(
454+
protected:
455+
std::unique_ptr<MappedScop> prepare(
456456
const std::unordered_map<std::string, size_t>& parameters,
457457
const CudaMappingOptions& mappingOptions) {
458458
std::string tc = R"TC(
@@ -462,9 +462,45 @@ def fun(float(N,K) A, float(K,M) B, float(N,M) C) -> (O) {
462462
}
463463
)TC";
464464

465-
auto mscop = makeMappedScop(tc, mappingOptions, parameters);
465+
return makeMappedScop(tc, mappingOptions, parameters);
466+
}
467+
468+
std::string emitCode(const std::unique_ptr<MappedScop>& mscop) {
466469
return std::get<0>(mscop->codegen("fun"));
467470
}
471+
472+
std::string emitCode(
473+
const std::unordered_map<std::string, size_t>& parameters,
474+
const CudaMappingOptions& mappingOptions) {
475+
return emitCode(prepare(parameters, mappingOptions));
476+
}
477+
478+
void expectNoABCPromotion(const std::string& code) {
479+
auto aDeclPos = code.find(" float32 _A_0");
480+
auto bDeclPos = code.find(" float32 _B_0");
481+
auto cDeclPos = code.find(" float32 _C_0");
482+
EXPECT_TRUE(aDeclPos == std::string::npos)
483+
<< "tensor A promoted to register but has elements accessed "
484+
<< "by multiple threads";
485+
EXPECT_TRUE(bDeclPos == std::string::npos)
486+
<< "tensor B promoted to register but has elements accessed "
487+
<< "by multiple threads";
488+
EXPECT_TRUE(cDeclPos == std::string::npos)
489+
<< "tensor C promoted to register but has no reuse";
490+
}
491+
492+
void expectNoSymbolicSubscript(const std::string& code) {
493+
// We don't know the exact name of the iterator, but it starts with c.
494+
auto oWithIteratorPos = code.find("_O_0[c");
495+
auto oWithThreadPos = code.find("_O_0[t1");
496+
497+
EXPECT_TRUE(oWithIteratorPos == std::string::npos)
498+
<< "accessing local arrays with iterators in subscripts makes "
499+
<< "these arrays placed in local memory instead of registers";
500+
EXPECT_TRUE(oWithThreadPos == std::string::npos)
501+
<< "expected per-thread groups to be computed, i.e. thread "
502+
<< "identifiers should not appear in the subscripts";
503+
}
468504
};
469505

470506
TEST_F(MatMulBias, RegisterPromotion) {
@@ -482,8 +518,6 @@ TEST_F(MatMulBias, RegisterPromotion) {
482518

483519
auto originalAccPos =
484520
code.find("O[32 * b0 + c3][t0 + 32 * b1]", copyToPos + 1);
485-
auto cDeclPos = code.find("float32 _C_0");
486-
auto aDeclPos = code.find("float32 _A_0");
487521

488522
EXPECT_TRUE(declPos != std::string::npos) << "no declaration of the register";
489523
EXPECT_TRUE(copyToPos != std::string::npos) << "expected copy to register";
@@ -492,10 +526,8 @@ TEST_F(MatMulBias, RegisterPromotion) {
492526

493527
EXPECT_NE(originalAccPos, copyFromPos)
494528
<< "global array reference is used in main computation";
495-
EXPECT_TRUE(cDeclPos == std::string::npos)
496-
<< "tensor C promoted to register but has no reuse";
497-
EXPECT_TRUE(aDeclPos == std::string::npos)
498-
<< "tensor A promoted to register but has elements accessed by multiple threads";
529+
530+
expectNoABCPromotion(code);
499531
}
500532

501533
TEST_F(MatMulBias, RegisterPromotionSharedPreference) {
@@ -506,16 +538,93 @@ TEST_F(MatMulBias, RegisterPromotionSharedPreference) {
506538
.usePrivateMemory(true);
507539

508540
auto code = emitCode({{"N", 42}, {"M", 56}, {"K", 37}}, mappingOptions);
509-
auto declPos = code.find("float32 _O_0[1][1]");
510-
auto cDeclPos = code.find("float32 _C_0[1][1]");
511-
auto aDeclPos = code.find("float32 _A_0[1][1]");
512541

542+
auto declPos = code.find("float32 _O_0[1][1]");
513543
EXPECT_TRUE(declPos == std::string::npos)
514544
<< "not expected promotion to register because promoted to shared";
515-
EXPECT_TRUE(cDeclPos == std::string::npos)
516-
<< "tensor C promoted to register but has no reuse";
517-
EXPECT_TRUE(aDeclPos == std::string::npos)
518-
<< "tensor A promoted to register but has elements accessed by multiple threads";
545+
546+
expectNoABCPromotion(code);
547+
}
548+
549+
TEST_F(MatMulBias, RegistersAtRoot) {
550+
// Disable automatic promotion to registers because we are going to call it
551+
// manually. Require sufficient unrolling to actually hit registers.
552+
auto mappingOptions = CudaMappingOptions::makeNaiveMappingOptions()
553+
.unroll(512)
554+
.useSharedMemory(false)
555+
.usePrivateMemory(false);
556+
557+
auto mscop = prepare({{"N", 42}, {"M", 56}, {"K", 37}}, mappingOptions);
558+
promoteToRegistersBelow(*mscop, mscop->scop().scheduleRoot());
559+
auto code = emitCode(mscop);
560+
561+
// Expecting 4 elements because we map the loop i in O[i][j] to 8 threads
562+
// after tiling by 32.
563+
auto oDeclPos = code.find("float32 _O_0[4][1];");
564+
EXPECT_TRUE(oDeclPos != std::string::npos)
565+
<< "expected O to be promoted to registers";
566+
567+
expectNoABCPromotion(code);
568+
expectNoSymbolicSubscript(code);
569+
570+
auto o00Pos = code.find("_O_0[0][0]");
571+
auto o10Pos = code.find("_O_0[1][0]");
572+
auto o20Pos = code.find("_O_0[2][0]");
573+
auto o30Pos = code.find("_O_0[3][0]");
574+
575+
EXPECT_TRUE(o00Pos != std::string::npos)
576+
<< "expected constant subscripts in _O_0";
577+
EXPECT_TRUE(o10Pos != std::string::npos)
578+
<< "expected constant subscripts in _O_0";
579+
EXPECT_TRUE(o20Pos != std::string::npos)
580+
<< "expected constant subscripts in _O_0";
581+
EXPECT_TRUE(o30Pos != std::string::npos)
582+
<< "expected constant subscripts in _O_0";
583+
}
584+
585+
TEST_F(MatMulBias, RegistersAtRootNotEnoughUnroll) {
586+
// Disable automatic promotion to registers because we are going to call it
587+
// manually. Require no unrolling so as to make promotion to registers
588+
// invalid.
589+
auto mappingOptions = CudaMappingOptions::makeNaiveMappingOptions()
590+
.unroll(1)
591+
.useSharedMemory(false)
592+
.usePrivateMemory(false);
593+
594+
auto mscop = prepare({{"N", 42}, {"M", 56}, {"K", 37}}, mappingOptions);
595+
promoteToRegistersBelow(*mscop, mscop->scop().scheduleRoot());
596+
auto code = emitCode(mscop);
597+
auto oDeclPos = code.find("float32 _O_0;");
598+
599+
EXPECT_TRUE(oDeclPos == std::string::npos)
600+
<< "not expected O to be promoted to registers";
601+
602+
expectNoABCPromotion(code);
603+
expectNoSymbolicSubscript(code);
604+
}
605+
606+
TEST_F(MatMulBias, RegistersBelowFirstBand) {
607+
using namespace polyhedral::detail;
608+
609+
// Disable automatic promotion to registers because we are going to call it
610+
// manually.
611+
auto mappingOptions = CudaMappingOptions::makeNaiveMappingOptions()
612+
.useSharedMemory(false)
613+
.usePrivateMemory(false);
614+
auto mscop = prepare({{"N", 42}, {"M", 56}, {"K", 37}}, mappingOptions);
615+
616+
auto nodes = ScheduleTree::collectDFSPostorder(
617+
mscop->scop().scheduleRoot(), ScheduleTreeType::Band);
618+
ASSERT_GT(nodes.size(), 0u);
619+
auto node = nodes[0];
620+
promoteToRegistersBelow(*mscop, node);
621+
auto code = emitCode(mscop);
622+
623+
auto oDeclPos = code.find("float32 _O_0[1][1];");
624+
EXPECT_TRUE(oDeclPos != std::string::npos)
625+
<< "expected O to be promoted to registers";
626+
expectNoABCPromotion(code);
627+
expectNoSymbolicSubscript(code);
519628
}
520629

521630
class Strided : public TestMapper {

0 commit comments

Comments
 (0)