Merge pull request #418 from facebookresearch/pr/reduction

nicolasvasilache · web-flow · commit 92b874b1897a · 2018-05-11T21:17:04.000-04:00
insert reduction synchronization outside thread mapping
diff --git a/tc/core/polyhedral/cuda/mapped_scop.cc b/tc/core/polyhedral/cuda/mapped_scop.cc
@@ -372,7 +372,8 @@ size_t MappedScop::mapToThreads(detail::ScheduleTree* band) {
     bandSplit(scop_->scheduleRoot(), band, nCanMap - nMappedThreads);
     auto child = band->child({0});
     if (isReduction) {
-      // Update reductionBandUpdates_ such that splitOutReductionAndInsertSyncs
+      // Update reductionBandUpdates_ such that
+      // splitOutReductionTileAndInsertSyncs
       // can find the information it needs.
       reductionBandUpdates_.emplace(child, reductionBandUpdates_.at(band));
       reductionBandUpdates_.erase(band);
@@ -387,12 +388,12 @@ size_t MappedScop::mapToThreads(detail::ScheduleTree* band) {
 
   CHECK_GT(nMappedThreads, 0u) << "not mapping to threads";
 
-  mapThreadsBackward(band);
-
   if (isReduction) {
-    splitOutReductionAndInsertSyncs(band, nMappedThreads - 1);
+    band = splitOutReductionTileAndInsertSyncs(band);
   }
 
+  mapThreadsBackward(band);
+
   return numThreads.view.size();
 }
 
@@ -946,17 +947,32 @@ std::tuple<std::string, tc::Grid, tc::Block> MappedScop::codegen(
       mappedScopForCodegen->numThreads);
 }
 
-// Split out reduction member at position "dim" in "band" and
-// insert reduction synchronizations outside this split off band.
-void MappedScop::splitOutReductionAndInsertSyncs(
-    detail::ScheduleTree* band,
-    int dim) {
+// Split out a single reduction tile (in the directions other than
+// the reduction) and insert reduction synchronizations outside this tile.
+// Return a pointer to the split off tile.
+detail::ScheduleTree* MappedScop::splitOutReductionTileAndInsertSyncs(
+    detail::ScheduleTree* band) {
   using namespace polyhedral::detail;
+  size_t n = numThreads.view.size();
+
+  // The current band contains only full blocks.
+  // Split off a band that iterates over these blocks,
+  // such that only a single block gets mapped to thread identifiers.
+  // The mapping to thread identifier X is allowed to iterate
+  // over multiple blocks, so this direction is not tiled.
+  std::vector<size_t> sizes(n);
+  for (size_t i = 1; i < n; ++i) {
+    sizes[n - 1 - i] = numThreads.view[i];
+  }
+  sizes[n - 1] = 0;
+  bandTile(band, sizes, TileOptions::ScaleTileLoops);
 
-  auto tree = bandSplitOut(scop_->scheduleRoot(), band, dim);
+  // Insert synchronization outside the single block.
+  auto child = band->child({0});
   for (auto updateId : reductionBandUpdates_.at(band).ids) {
-    scop_->insertReductionSync1D(tree, updateId);
+    scop_->insertReductionSync1D(child, updateId);
   }
+  return child;
 }
 
 std::unique_ptr<MappedScop> MappedScop::makeWithOuterBlockInnerThreadStrategy(
diff --git a/tc/core/polyhedral/cuda/mapped_scop.h b/tc/core/polyhedral/cuda/mapped_scop.h
@@ -182,11 +182,11 @@ class MappedScop {
  private:
   // Insert the optimal combination of synchronizations in the sequence
   void insertBestSyncInSeq(detail::ScheduleTree* seq);
-  // Split out reduction bands and insert reduction synchronizations.
-  void splitOutReductionsAndInsertSyncs();
-  // Split out reduction member at position "dim" in "band" and
-  // insert reduction synchronizations.
-  void splitOutReductionAndInsertSyncs(detail::ScheduleTree* band, int dim);
+  // Split out a single reduction tile (in the directions other than
+  // the reduction) and insert reduction synchronizations outside this tile.
+  // Return a pointer to the split off tile.
+  detail::ScheduleTree* splitOutReductionTileAndInsertSyncs(
+      detail::ScheduleTree* band);
   // Map "band" to thread identifiers using as many blockSizes values as outer
   // coincident dimensions (plus reduction dimension, if any),
   // insert synchronization in case of a reduction, and