Merge pull request #472 from facebookresearch/strided-access

ftynse · web-flow · commit 3abaacc57e2f · 2018-06-07T12:21:51.000+02:00
Support strided access in shared memory promotion
diff --git a/tc/core/polyhedral/memory_promotion.cc b/tc/core/polyhedral/memory_promotion.cc
@@ -31,10 +31,50 @@ namespace polyhedral {
 using detail::ScheduleTree;
 
 namespace {
+// Remove strides specified by "strides" and "offsets" from the range of
+// "relation".  In particular, if "relation" has the shape
+//
+//   D -> O: o_i = offset_i + stride_i * f(D)
+//
+// transform it into
+//
+//   D -> O: o_i = f(D)
+//
+// by subtracting "offsets" and by dividing the result by "strides".
+isl::map removeRangeStrides(
+    isl::map relation,
+    isl::multi_val strides,
+    isl::multi_aff offsets) {
+  CHECK_EQ(strides.size(), offsets.size());
+
+  auto space = relation.get_space();
+  auto stridesMA = isl::multi_aff::identity(space.range().map_from_set());
+  stridesMA = stridesMA / strides;
+
+  return relation.sum(isl::map(offsets.neg())).apply_range(isl::map(stridesMA));
+}
+
+// Compute a box approximation of the range of the given relation,
+// including the lower bounds, the box sizes, and the strides.
+// If the range has strides, remove them first.
 ScopedFootprint outputRanges(isl::map access) {
+  auto ctx = access.get_ctx();
+  int nSubscripts = access.dim(isl::dim_type::out);
+
+  auto strides = isl::val_list(ctx, nSubscripts);
+  auto strideOffsets = isl::aff_list(ctx, nSubscripts);
+  for (int i = 0; i < nSubscripts; ++i) {
+    auto si = access.get_range_stride_info(i);
+    strides = strides.add(si.get_stride());
+    strideOffsets = strideOffsets.add(si.get_offset());
+  }
+
   ScopedFootprint footprint;
+  footprint.strideValues = isl::multi_val(access.get_space().range(), strides);
+  footprint.strideOffsets = isl::multi_aff(access.get_space(), strideOffsets);
 
-  // TODO: also compute strides
+  access = removeRangeStrides(
+      access, footprint.strideValues, footprint.strideOffsets);
 
   footprint.box = access.get_range_simple_fixed_box_hull();
   return footprint;
@@ -77,16 +117,23 @@ std::unique_ptr<TensorReferenceGroup> TensorReferenceGroup::makeSingleton(
   return group;
 }
 
-isl::set ScopedFootprint::footprint(isl::set domain) const {
-  auto space = box.get_space();
-  auto accessed = isl::map::universe(space).intersect_domain(domain);
+isl::set TensorReferenceGroup::approximateFootprint() const {
+  auto scopedDomain = scopedAccesses().domain();
+  auto space = approximation.box.get_space();
+  auto accessed = isl::map::universe(space).intersect_domain(scopedDomain);
   auto lspace = isl::local_space(accessed.get_space().range());
 
-  for (size_t i = 0; i < dim(); ++i) {
-    auto dimLowerBound = lowerBound(i);
+  for (size_t i = 0; i < approximation.dim(); ++i) {
+    auto offset = approximation.lowerBound(i);
+    auto stride = approximation.stride(i);
+    auto strideOffset = approximation.strideOffset(i);
+    auto size = approximation.size(i);
     auto rhs = isl::aff(lspace, isl::dim_type::set, i);
-    isl::map partial = (isl::aff_map(dimLowerBound) <= rhs) &
-        (isl::aff_map(dimLowerBound + size(i)) > rhs);
+    auto lowerBound = offset * stride + strideOffset;
+    auto upperBound = (offset + size) * stride + strideOffset;
+    auto partial =
+        (isl::aff_map(lowerBound) <= rhs) & (isl::aff_map(upperBound) > rhs);
+
     accessed = accessed & partial;
   }
   return accessed.range();
@@ -303,7 +350,9 @@ TensorGroups TensorReferenceGroup::accessedBySubtree(
 
 // Compute the relation between schedule dimensions, original and promoted array
 // subscripts, in the space
-//   [S -> O] -> P
+//   [S -> O] -> O.
+// The caller is in charge of updating the tuple of the target space with the
+// group identifier.
 // The mapping depends on the original schedule dimensions because the same
 // elements of the promoted array get assigned different values of the original
 // array in different outer loop iterations; it's impossible to project out the
@@ -313,10 +362,20 @@ isl::multi_aff TensorReferenceGroup::promotion() const {
   isl::map map = scopedAccesses();
   auto accessSpace = map.get_space();
 
-  // lower bounds space is S -> P; which we transform into [S -> O] -> P
-  auto lowerBounds = approximation.lowerBounds().pullback(
-      isl::multi_aff::domain_map(accessSpace));
-  auto promotion = isl::multi_aff::range_map(accessSpace) - lowerBounds;
+  // Construct a projection multi-aff in [S -> O] -> S
+  // for further precomposition.
+  auto originalSpaceInserter = isl::multi_aff::domain_map(accessSpace);
+
+  // Lower bounds and offsets live in S -> O; transform into [S -> O] -> O.
+  auto lowerBounds =
+      approximation.lowerBounds().pullback(originalSpaceInserter);
+  auto offsets = approximation.strideOffsets.pullback(originalSpaceInserter);
+
+  // Create the promotion starting from the projection [S -> O] -> O.
+  auto original = isl::multi_aff::range_map(accessSpace);
+  auto promotion =
+      (original - offsets) / approximation.strideValues - lowerBounds;
+
   return promotion;
 }
 
diff --git a/tc/core/polyhedral/memory_promotion.h b/tc/core/polyhedral/memory_promotion.h
@@ -33,8 +33,10 @@ enum class AccessType : short { Read, Write };
 // Rectangular overapproximation of a tensor elements accessed through a single
 // reference.
 // Each dimension is overapproximated by a lower bound, an affine function of
-// parameters and schedule dimensions visible around the scope, and by a
-// constant size.
+// parameters and schedule dimensions visible around the scope, by a
+// constant size, and by a pair offset/stride for strided accesses.  If the
+// access is not strided, then "offset" is a zero expression and "stride" is 1.
+// The lowerBound and the size are computed after removing the potential stride.
 // The scope is defined by a specific position in a schedule tree (const
 // ScheduleTree*), the user is responsible for maintaining the correspondance
 // between schedule tree positions and footprints.
@@ -48,8 +50,17 @@ struct ScopedFootprint {
   isl::aff lowerBound(size_t pos) const {
     return box.get_offset().get_aff(pos);
   }
+  isl::val stride(size_t pos) const {
+    return strideValues.get_val(pos);
+  }
+  isl::aff strideOffset(size_t pos) const {
+    return strideOffsets.get_aff(pos);
+  }
+
   isl::fixed_box box;
-  isl::set footprint(isl::set domain) const;
+  isl::multi_val strideValues;
+  isl::multi_aff strideOffsets;
+
   isl::multi_aff lowerBounds() const;
 };
 
@@ -131,9 +142,7 @@ class TensorReferenceGroup {
 
   // Rectangular overapproximation of the set of tensor elements accessed below
   // the scoping point.
-  isl::set approximateFootprint() const {
-    return approximation.footprint(scopedAccesses().domain());
-  }
+  isl::set approximateFootprint() const;
 
   isl::multi_aff promotion() const;
   isl::set promotedFootprint() const;
diff --git a/tc/core/polyhedral/schedule_tree_matcher-inl.h b/tc/core/polyhedral/schedule_tree_matcher-inl.h
@@ -61,6 +61,12 @@ inline ScheduleTreeMatcher context(Args... children) {
   return ScheduleTreeMatcher(detail::ScheduleTreeType::Context, children...);
 }
 
+template <typename... Args>
+inline ScheduleTreeMatcher threadSpecific(Args... children) {
+  return ScheduleTreeMatcher(
+      detail::ScheduleTreeType::ThreadSpecificMarker, children...);
+}
+
 template <typename... Args>
 inline ScheduleTreeMatcher filter(
     std::function<bool(isl::union_set)> propertyMatcher,
diff --git a/tc/external/detail/islpp-inl.h b/tc/external/detail/islpp-inl.h
@@ -185,6 +185,13 @@ inline isl::map operator<=(isl::aff_map A, isl::aff B) {
   return A < B + 1;
 }
 
+///////////////////////////////////////////////////////////////////////////////
+// Operations on isl::multi_aff
+///////////////////////////////////////////////////////////////////////////////
+inline isl::multi_aff operator/(isl::multi_aff left, isl::multi_val right) {
+  return left.scale_down(right);
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 // Operations on isl::set and isl::union_set
 ///////////////////////////////////////////////////////////////////////////////
diff --git a/tc/external/detail/islpp.h b/tc/external/detail/islpp.h
@@ -176,6 +176,11 @@ isl::map operator<=(isl::aff_map A, isl::aff B);
 isl::map operator>(isl::aff_map A, isl::aff B);
 isl::map operator<(isl::aff_map A, isl::aff B);
 
+///////////////////////////////////////////////////////////////////////////////
+// Operations on isl::multi_aff
+///////////////////////////////////////////////////////////////////////////////
+isl::multi_aff operator/(isl::multi_aff left, isl::multi_val right);
+
 ///////////////////////////////////////////////////////////////////////////////
 // Operations on isl::set and isl::union_set
 ///////////////////////////////////////////////////////////////////////////////
diff --git a/test/test_cuda_mapper_memory_promotion.cc b/test/test_cuda_mapper_memory_promotion.cc
@@ -504,6 +504,114 @@ TEST_F(MatMulBias, RegisterPromotionSharedPreference) {
       << "tensor A promoted to register but has elements accessed by multiple threads";
 }
 
+class Strided : public TestMapper {
+ public:
+  std::unique_ptr<MappedScop> makeScopAndCheck(
+      const std::string& tc,
+      const std::unordered_map<std::string, size_t>& sizes,
+      long groupISize,
+      long groupIStride,
+      long groupIConstOffset) {
+    auto options = CudaMappingOptions::makeNaiveMappingOptions()
+                       .tile(32, 32)
+                       .mapToThreads(21, 21)
+                       .useSharedMemory(false);
+    auto mscop = makeMappedScop(tc, options, sizes);
+    auto& scop = mscop->scop();
+    auto ctx = scop.domain().get_ctx();
+
+    auto groups = TensorReferenceGroup::accessedBySubtree(
+        scop.scheduleRoot()->child({0, 0, 0}), scop);
+    EXPECT_EQ(groups.size(), 2u) << "expected groups for both tensors";
+
+    for (const auto& g : groups) {
+      auto name = g.first.get_name();
+      if (name != "I") {
+        continue;
+      }
+
+      const auto& perTensorGroups = g.second;
+      // One cannot use ASSERT_EQ in a function that returns something, because
+      // it would trigger an immediate return without value.  Use EXPECT_EQ and
+      // return nullptr manually.
+      EXPECT_EQ(perTensorGroups.size(), 1u) << "expected one group for I";
+      if (perTensorGroups.size() != 1u) {
+        return nullptr;
+      }
+      const auto& oneGroup = perTensorGroups[0];
+
+      EXPECT_EQ(oneGroup->references.size(), 1u)
+          << "expected one reference in the group for I";
+      if (oneGroup->references.size() != 1u) {
+        return nullptr;
+      }
+      const auto& ref = oneGroup->references[0];
+
+      EXPECT_EQ(oneGroup->approximation.dim(), 2u)
+          << "could not compute approximation for " << ref->scopedAccess;
+
+      EXPECT_EQ(oneGroup->approximation.size(1), isl::val(ctx, groupISize))
+          << "expected strides to be removed";
+
+      isl::val stride = isl::val(ctx, groupIStride);
+      EXPECT_EQ(oneGroup->approximation.stride(1), stride);
+
+      auto expectedOffset =
+          isl::aff::zero_on_domain(ref->scopedAccess.domain().get_space()) +
+          groupIConstOffset;
+      // Convert to pw_aff because it has is_equal whereas a simple aff only has
+      // is_plain_equal that fails here.
+      EXPECT_TRUE(
+          isl::pw_aff(oneGroup->approximation.strideOffset(1).mod(stride))
+              .is_equal(isl::pw_aff(expectedOffset.mod(stride))))
+          << oneGroup->approximation.strideOffset(1) << "\n"
+          << expectedOffset;
+    }
+    return mscop;
+  }
+};
+
+// Check that strides are effectively handled in memory promotion.  In
+// particular, check that array elements that are jumped over
+// by the main computation are not copied into shared memory.
+TEST_F(Strided, Stride2) {
+  std::string tc = R"TC(
+def strided(float(N,M) I) -> (O) {
+  O(i, j) = I(j, 2 * i + 1)
+}
+)TC";
+
+  // Expect the promoted size to be 32x32, with stride 2 and offset -1
+  // (compared modulo the stride) along the second dimension.
+  auto mscop = makeScopAndCheck(tc, {{"N", 42}, {"M", 420}}, 32, 2, -1);
+  ASSERT_TRUE(mscop.get() != nullptr);
+  auto& scop = mscop->scop();
+
+  // Additionally check that copies look fine.
+  scop.promoteEverythingAt({0, 0, 0});
+  auto code = std::get<0>(mscop->codegen("strided"));
+  EXPECT_TRUE(
+      code.find("_I_0[c2][c3] = I[32 * b1 + c2][64 * b0 + 2 * c3 + 1]") !=
+      std::string::npos)
+      << "expected strided accesses to global array in copies";
+  EXPECT_TRUE(code.find("= _I_0[c3][c2]") != std::string::npos)
+      << "expected non-strided access to promoted array in main computation";
+  EXPECT_TRUE(code.find("= _I_0[c3][2 * c2") == std::string::npos)
+      << "did not expect strided access to promoted array in main computation";
+}
+
+TEST_F(Strided, Stride5) {
+  std::string tc = R"TC(
+def strided(float(N,M) I) -> (O) {
+  O(i, j) = I(j, 5 * i)
+}
+)TC";
+
+  // Expect the promoted size to be 32x32, with stride 5 and offset 0 along
+  // the second dimension.
+  makeScopAndCheck(tc, {{"N", 42}, {"M", 420}}, 32, 5, 0);
+}
+
 int main(int argc, char** argv) {
   ::testing::InitGoogleTest(&argc, argv);
   ::gflags::ParseCommandLineFlags(&argc, &argv, true);