Skip to content
This repository was archived by the owner on Apr 28, 2023. It is now read-only.

Commit 9f7300a

Browse files
authored
Merge pull request #521 from facebookresearch/registers-at-depth
Tunable register promotion depth
2 parents d384ed3 + 1ad8e87 commit 9f7300a

16 files changed

+211
-39
lines changed

tc/autotuner/autotuner-inl.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -372,6 +372,8 @@ void setupTuningParameters(
372372
configuration.gridParams.setRange(range, "g");
373373
configuration.unrollFactor =
374374
RangeParameter(powers2(FLAGS_tuner_max_unroll_size), "unroll");
375+
configuration.privateDepth =
376+
RangeParameter({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, "pdepth");
375377
}
376378
} // namespace
377379

tc/autotuner/parameters.cc

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -239,6 +239,7 @@ void TuningConfiguration::applyToParameters(
239239
unrollCopyShared.apply(f);
240240
useReadOnlyCache.apply(f);
241241
matchLibraryCalls.apply(f);
242+
privateDepth.apply(f);
242243
}
243244

244245
bool TuningConfiguration::isValid() const {
@@ -273,6 +274,7 @@ std::vector<ParameterView> TuningConfiguration::collectParameters() {
273274
params.emplace_back(unrollCopyShared);
274275
params.emplace_back(useReadOnlyCache);
275276
params.emplace_back(matchLibraryCalls);
277+
params.emplace_back(privateDepth);
276278

277279
return params;
278280
}
@@ -303,6 +305,7 @@ void TuningConfiguration::fromCudaMappingOptions(
303305
usePrivateMemory.selectValue(options.proto().use_private_memory());
304306
unrollCopyShared.selectValue(options.proto().unroll_copy_shared());
305307
useReadOnlyCache.selectValue(options.proto().use_readonly_cache());
308+
privateDepth.selectFromValue(options.proto().private_depth());
306309
}
307310

308311
void TuningConfiguration::fromCpuMappingOptions(
@@ -331,6 +334,7 @@ void TuningConfiguration::applyToCudaMappingOptions(
331334
options.usePrivateMemory(usePrivateMemory.value());
332335
options.unrollCopyShared(unrollCopyShared.value());
333336
options.useReadOnlyCache(useReadOnlyCache.value());
337+
options.privateDepth(privateDepth.value());
334338
}
335339

336340
void TuningConfiguration::applyToCpuMappingOptions(

tc/autotuner/parameters.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,7 @@ class TuningConfiguration {
190190
BoolParameter unrollCopyShared;
191191
BoolParameter useReadOnlyCache;
192192
BoolParameter matchLibraryCalls;
193+
RangeParameter privateDepth;
193194

194195
private:
195196
std::vector<std::function<bool(const TuningConfiguration&)>> validators_;
@@ -227,6 +228,7 @@ class TuningParameterFixer {
227228
llvm::Optional<bool> unrollCopyShared;
228229
llvm::Optional<bool> useReadOnlyCache;
229230
llvm::Optional<bool> matchLibraryCalls;
231+
llvm::Optional<uint32_t> privateDepth;
230232

231233
friend class TuningConfiguration;
232234
};

tc/core/cuda/cuda_mapping_options.cc

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -289,6 +289,11 @@ CudaMappingOptions& CudaMappingOptions::useReadOnlyCache(bool b) {
289289
return *this;
290290
}
291291

292+
CudaMappingOptions& CudaMappingOptions::privateDepth(uint32_t depth) {
293+
ownedProto_.set_private_depth(depth);
294+
return *this;
295+
}
296+
292297
CudaMappingOptions& CudaMappingOptions::mapToThreads(
293298
const std::string& commaSeparatedSizes) {
294299
auto sizes = parseCommaSeparatedIntegers<uint64_t>(commaSeparatedSizes);

tc/core/cuda/cuda_mapping_options.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,7 @@ class CudaMappingOptions {
195195
CudaMappingOptions& maxSharedMemory(uint64_t size);
196196
CudaMappingOptions& unrollCopyShared(bool b);
197197
CudaMappingOptions& useReadOnlyCache(bool b);
198+
CudaMappingOptions& privateDepth(uint32_t depth);
198199
///@}
199200

200201
/// Static constructors for predefined strategies.

tc/core/cuda/cuda_mapping_options_cpp_printer.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ CudaMappingOptionsCppPrinter& operator<<(
3838
prn.printValueOption(
3939
"maxSharedMemory", cudaOptions.proto().max_shared_memory());
4040
}
41+
prn.printValueOption("privateDepth", cudaOptions.proto().private_depth());
4142
prn.endStmt();
4243
return prn;
4344
}

tc/core/polyhedral/cuda/mapped_scop.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1068,7 +1068,7 @@ std::unique_ptr<MappedScop> MappedScop::makeWithOuterBlockInnerThreadStrategy(
10681068

10691069
// 9. Promote to registers below the loops mapped to threads.
10701070
if (cudaOptions.proto().use_private_memory()) {
1071-
promoteToRegistersBelowThreads(*mappedScop, -1ull);
1071+
promoteToRegistersAtDepth(*mappedScop, cudaOptions.proto().private_depth());
10721072
}
10731073

10741074
LOG_IF(INFO, FLAGS_debug_tc_mapper)

tc/core/polyhedral/cuda/memory_promotion_heuristic.cc

Lines changed: 76 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -437,14 +437,20 @@ bool isPromotableToRegistersBelow(
437437
}
438438

439439
/*
440-
* Starting from the root, find bands where depth is reached. Using
440+
* Starting from the root, find bands where depth is reached. If zero depth is
441+
* requested, insert a zero-dimensional band node below the root (or the
442+
* context node if present) and return it. Otherwise, use
441443
* DFSPreorder to make sure order is specified and consistent for tests.
442444
*/
443445
std::vector<detail::ScheduleTree*> bandsContainingScheduleDepth(
444446
detail::ScheduleTree* root,
445447
size_t depth) {
446448
using namespace tc::polyhedral::detail;
447449

450+
if (depth == 0) {
451+
return {insertTopLevelEmptyBand(root)};
452+
}
453+
448454
auto bands =
449455
ScheduleTree::collectDFSPreorder(root, detail::ScheduleTreeType::Band);
450456
std::function<bool(ScheduleTree * st)> containsDepth = [&](ScheduleTree* st) {
@@ -602,6 +608,15 @@ void promoteToSharedGreedy(
602608
scop.insertSyncsAroundCopies(bandNode);
603609
}
604610
}
611+
612+
/*
613+
* Check if "tree" is a band node mapped to threads. In particular, check that
614+
* "tree" is a band and a thread-specific node appears as its only child.
615+
*/
616+
inline bool isThreadMappedBand(const detail::ScheduleTree* tree) {
617+
return matchOne(band(threadSpecific(any())), tree) ||
618+
matchOne(band(threadSpecific()), tree);
619+
}
605620
} // namespace
606621

607622
void promoteGreedilyAtDepth(
@@ -698,16 +713,69 @@ void promoteToRegistersBelow(MappedScop& mscop, detail::ScheduleTree* scope) {
698713
partialSched);
699714
}
700715
}
716+
717+
// Return immediately if nothing was promoted.
718+
if (scope->numChildren() == 0 ||
719+
!matchOne(extension(sequence(any())), scope->child({0}))) {
720+
return;
721+
}
722+
723+
// If promoting above thread mapping, insert synchronizations.
724+
// It is possible that promoted array elements are accessed by different
725+
// threads outside the current scope (either in different iterations of the
726+
// scope loops, or in sibling subtrees). For now, always insert
727+
// synchronizations, similarly to copies to shared memory.
728+
//
729+
// TODO: The exact check for sync insertion requires the dependences between
730+
// the elements in the scope and those before/after the scope and a check if
731+
// the dependent instances belong to the same thread.
732+
auto ancestors = scope->ancestors(root);
733+
if (functional::Filter(isMappingTo<mapping::ThreadId>, ancestors).empty()) {
734+
scop.insertSyncsAroundSeqChildren(scope->child({0, 0}));
735+
}
701736
}
702737

703-
// Promote at the positions of the thread specific markers.
704-
void promoteToRegistersBelowThreads(MappedScop& mscop, size_t nRegisters) {
705-
auto& scop = mscop.scop();
706-
auto root = scop.scheduleRoot();
707-
auto markers = findThreadSpecificMarkers(root);
738+
/*
739+
* Promote to registers below "depth" schedule dimensions. Split bands if
740+
* necessary to create promotion scopes. Do not promote if it would require
741+
* splitting the band mapped to threads as we assume only one band can be
742+
 * mapped to threads.
743+
*/
744+
void promoteToRegistersAtDepth(MappedScop& mscop, size_t depth) {
745+
using namespace detail;
708746

709-
for (auto marker : markers) {
710-
promoteToRegistersBelow(mscop, marker);
747+
auto root = mscop.scop().scheduleRoot();
748+
749+
// 1. Collect all bands with a member located at the given depth in the
750+
// overall schedule. Make sure this is the last member of the band by
751+
// splitting off the subsequent members into a different band. Ignore bands
752+
// mapped to threads if splitting is required as it would break the invariant
753+
// of a single band being mapped to threads in a subtree.
754+
// TODO: allow splitting the thread-mapped bands; for example, tile them
755+
// explicitly with block size, use the point loops for thread mapping
756+
// but ignore them in depth computation.
757+
auto bands = bandsContainingScheduleDepth(root, depth);
758+
bands = functional::Filter(
759+
[root, depth](ScheduleTree* tree) {
760+
auto band = tree->elemAs<ScheduleTreeElemBand>();
761+
return !isThreadMappedBand(tree) ||
762+
tree->scheduleDepth(root) + band->nMember() == depth;
763+
},
764+
bands);
765+
bands = bandsSplitAfterDepth(bands, root, depth);
766+
767+
// 2. We don't want copies inserted between thread-mapped bands and the
768+
// thread-specific marker, but rather below that marker. If any of the bands
769+
// are mapped to threads, take their first children as promotion scope
770+
// instead of the band itself.
771+
std::function<ScheduleTree*(ScheduleTree*)> findScope =
772+
[](ScheduleTree* tree) {
773+
return isThreadMappedBand(tree) ? tree->child({0}) : tree;
774+
};
775+
auto scopes = functional::Map(findScope, bands);
776+
777+
for (auto scope : scopes) {
778+
promoteToRegistersBelow(mscop, scope);
711779
}
712780
}
713781

tc/core/polyhedral/cuda/memory_promotion_heuristic.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ void promoteGreedilyAtDepth(
4444

4545
void promoteToRegistersBelow(MappedScop& mscop, detail::ScheduleTree* scope);
4646

47-
void promoteToRegistersBelowThreads(MappedScop& scop, std::size_t nRegisters);
47+
void promoteToRegistersAtDepth(MappedScop& scop, std::size_t depth);
48+
4849
} // namespace polyhedral
4950
} // namespace tc

tc/core/polyhedral/functional.h

Lines changed: 36 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,18 @@ I Reduce(std::function<I(I&&, I&&)> red, std::vector<I>&& vec) {
100100
return res;
101101
}
102102

103+
namespace {
104+
// Allow combinations:
105+
// LHS RHS
106+
// const T and T
107+
// const T and const T
108+
template <typename LHS, typename RHS>
109+
struct is_same_allow_rhs_non_const {
110+
const static bool value =
111+
std::is_same<LHS, typename std::add_const<RHS>::type>::value;
112+
};
113+
} // namespace
114+
103115
/*
104116
* Return a copy of the vector that contains only those elements for which
105117
* function "f" returns true.
@@ -108,8 +120,12 @@ I Reduce(std::function<I(I&&, I&&)> red, std::vector<I>&& vec) {
108120
* functor) with signature <bool(T)>.
109121
*
110122
* Template argument deduction takes care of vector<const T> cases
111-
* automatically. Note that the signature of "f" must use the same type, that
112-
* is "const T".
123+
* automatically. Additionally, accept vectors of (pointers to) non-const
124+
* elements and a function with (a pointer to) a const argument.
125+
* Note that the function type is a template argument guarded by static asserts
126+
* rather than an std::function because template argument deduction fails on
127+
* lambdas in the latter case; lambdas are convertible to std::function but are
128+
* not an instance of std::function.
113129
*/
114130
template <typename Func, typename T>
115131
std::vector<T> Filter(Func f, const std::vector<T>& input) {
@@ -119,9 +135,25 @@ std::vector<T> Filter(Func f, const std::vector<T>& input) {
119135
static_assert(
120136
function_traits<Func>::n_args == 1,
121137
"Filtering function must take one argument");
138+
139+
// Allow combinations:
140+
// Function arg Vector element
141+
// T and T
142+
// const T and T
143+
// const T and const T
144+
// T* and T*
145+
// const T* and T*
146+
// const T* and const T*
147+
using arg0 = typename function_traits<Func>::template arg<0>::type;
148+
constexpr bool sameType = std::is_same<arg0, T>::value;
149+
constexpr bool sameTypeRightMbConst =
150+
is_same_allow_rhs_non_const<arg0, T>::value;
151+
constexpr bool ptrToSameTypeRightMbConst = std::is_pointer<arg0>::value &&
152+
std::is_pointer<T>::value &&
153+
is_same_allow_rhs_non_const<typename std::remove_pointer<arg0>::type,
154+
typename std::remove_pointer<T>::type>::value;
122155
static_assert(
123-
std::is_same<typename function_traits<Func>::template arg<0>::type, T>::
124-
value,
156+
sameType || sameTypeRightMbConst || ptrToSameTypeRightMbConst,
125157
"The argument of the filtering function must have the same type "
126158
"as the element type of the collection being filtered");
127159

0 commit comments

Comments
 (0)