insert reduction synchronization outside thread mapping

Sven Verdoolaege · Sven Verdoolaege · commit 62fa601d6aab · 2018-05-11T19:06:21.000+02:00
Regular synchronization should never appear underneath a thread mapping
since the synchronization should be performed by all threads and
the mapping to threads may leave some thread instances unmapped.
Inserting reduction synchronization underneath the mapping was
apparently deemed safe because the partial tile separation
makes sure only complete blocks are mapped to reductions.
However, by having the synchronization inside the mapping,
the isl AST generator may generate tests outside this synchronization
that involve thread identifiers (even if the user knows that,
in combination with other constraints in the code, those same
conditions could be expressed without involving thread identifiers).
Insert the synchronization outside the mapping to prevent
this from happening.  This also means that the reduction
member no longer needs to be split off, such that
the thread mapping now always corresponds to a single band.

Note that while the partial tile separation makes sure
that only complete blocks are mapped to reductions,
multiple such complete blocks may still get mapped
by the thread mapping, including in the parallel
directions.  The current reduction handling does not
support this as it stores the partial reductions
in a single (per-thread) scalar variable.
The band mapped to threads therefore needs to be tiled
first such that it contains exactly one complete block
in the parallel directions.
diff --git a/tc/core/polyhedral/cuda/mapped_scop.cc b/tc/core/polyhedral/cuda/mapped_scop.cc
@@ -372,7 +372,8 @@ size_t MappedScop::mapToThreads(detail::ScheduleTree* band) {
     bandSplit(scop_->scheduleRoot(), band, nCanMap - nMappedThreads);
     auto child = band->child({0});
     if (isReduction) {
-      // Update reductionBandUpdates_ such that splitOutReductionAndInsertSyncs
+      // Update reductionBandUpdates_ such that
+      // splitOutReductionTileAndInsertSyncs
       // can find the information it needs.
       reductionBandUpdates_.emplace(child, reductionBandUpdates_.at(band));
       reductionBandUpdates_.erase(band);
@@ -387,12 +388,12 @@ size_t MappedScop::mapToThreads(detail::ScheduleTree* band) {
 
   CHECK_GT(nMappedThreads, 0u) << "not mapping to threads";
 
-  mapThreadsBackward(band);
-
   if (isReduction) {
-    splitOutReductionAndInsertSyncs(band);
+    band = splitOutReductionTileAndInsertSyncs(band);
   }
 
+  mapThreadsBackward(band);
+
   return numThreads.view.size();
 }
 
@@ -946,17 +947,32 @@ std::tuple<std::string, tc::Grid, tc::Block> MappedScop::codegen(
       mappedScopForCodegen->numThreads);
 }
 
-// Split out reduction member in "band" and
-// insert reduction synchronizations outside this split off band.
-void MappedScop::splitOutReductionAndInsertSyncs(
+// Split out a single reduction tile (in the directions other than
+// the reduction) and insert reduction synchronizations outside this tile.
+// Return a pointer to the split off tile.
+detail::ScheduleTree* MappedScop::splitOutReductionTileAndInsertSyncs(
     detail::ScheduleTree* band) {
   using namespace polyhedral::detail;
   size_t n = numThreads.view.size();
 
-  auto tree = bandSplitOut(scop_->scheduleRoot(), band, n - 1);
+  // The current band contains only full blocks.
+  // Split off a band that iterates over these blocks,
+  // such that only a single block gets mapped to thread identifiers.
+  // The mapping to thread identifier X is allowed to iterate
+  // over multiple blocks, so this direction is not tiled.
+  std::vector<size_t> sizes(n);
+  for (size_t i = 1; i < n; ++i) {
+    sizes[n - 1 - i] = numThreads.view[i];
+  }
+  sizes[n - 1] = 0;
+  bandTile(band, sizes, TileOptions::ScaleTileLoops);
+
+  // Insert synchronization outside the single block.
+  auto child = band->child({0});
   for (auto updateId : reductionBandUpdates_.at(band).ids) {
-    scop_->insertReductionSync1D(tree, updateId);
+    scop_->insertReductionSync1D(child, updateId);
   }
+  return child;
 }
 
 std::unique_ptr<MappedScop> MappedScop::makeWithOuterBlockInnerThreadStrategy(
diff --git a/tc/core/polyhedral/cuda/mapped_scop.h b/tc/core/polyhedral/cuda/mapped_scop.h
@@ -182,9 +182,11 @@ class MappedScop {
  private:
   // Insert the optimal combination of synchronizations in the sequence
   void insertBestSyncInSeq(detail::ScheduleTree* seq);
-  // Split out reduction member in "band" and
-  // insert reduction synchronizations.
-  void splitOutReductionAndInsertSyncs(detail::ScheduleTree* band);
+  // Split out a single reduction tile (in the directions other than
+  // the reduction) and insert reduction synchronizations.
+  // Return a pointer to the split off tile.
+  detail::ScheduleTree* splitOutReductionTileAndInsertSyncs(
+      detail::ScheduleTree* band);
   // Map "band" to thread identifiers using as many blockSizes values as outer
   // coincident dimensions (plus reduction dimension, if any),
   // insert synchronization in case of a reduction, and