
Commit 14d0ca9

ftynse authored and Sven Verdoolaege committed
memory promotion: handle strides
The original implementation of memory promotion ignored strides in accesses to simplify the code. isl recently introduced support for extracting stride information from sets, making stride manipulation easy in TC. Introduce special handling for strided accesses into TensorReferenceGroup and related classes.

An access is strided if the access function has the shape

    (a_i - offset_i) = 0 mod stride_i

where stride_i is a constant and offset_i is some affine expression on the iteration domain. Use isl to compute offsets and strides in access relations. Use this information to promote to shared memory only those tensor elements that are actually read in the case of strided accesses. This decreases the amount of shared memory used by a kernel with such accesses. It also prepares for the introduction of register promotion, where the accesses of each individual thread are strided with a stride equal to the number of threads.

Note that references accessing disjoint sets of elements with strides are not grouped even if their non-strided footprints overlap, e.g. A[2*i] and A[2*i + 1] belong to different groups. This may decrease the benefit of coalesced reads when copying between global and shared memory. At the same time, it also decreases the required shared memory size, making it possible to promote one of the references in cases where a group with two references would not fit. The profitability of such grouping requires further exploration.
1 parent d1673af · commit 14d0ca9
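For concreteness, here is a minimal sketch of the stride form described above on the access A[2*i + 1], where (a - 1) = 0 mod 2. It is an illustration only, not part of the commit; it assumes the isl C++ bindings generated for TC expose the string constructor of isl::map and the get_range_stride_info entry point that the diff below relies on.

#include <isl/cpp.h>  // assumed header for the isl C++ bindings

void strideInfoExample() {
  isl::ctx ctx(isl_ctx_alloc());
  // Access relation of A[2*i + 1] for 0 <= i < 32: every accessed subscript
  // a satisfies (a - 1) = 0 mod 2, i.e. stride 2 with offset 1.
  isl::map access(ctx, "{ S[i] -> A[a] : a = 2 * i + 1 and 0 <= i < 32 }");
  auto si = access.get_range_stride_info(0);
  auto stride = si.get_stride(); // isl::val: 2
  auto offset = si.get_offset(); // isl::aff: { S[i] -> [(1)] }
  // Promoting only the elements actually read needs 32 slots for
  // {1, 3, ..., 63}, rather than 63 slots for the full box [1, 63].
  (void)stride;
  (void)offset;
}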

3 files changed: 189 additions, 11 deletions

tc/core/polyhedral/memory_promotion.cc

Lines changed: 67 additions & 9 deletions
@@ -31,10 +31,50 @@ namespace polyhedral {
 using detail::ScheduleTree;

 namespace {
+// Remove strides specified by "strides" and "offsets" from the range of
+// "relation". In particular, relation has a shape
+//
+//   D -> O: o_i = offset_i + stride_i * f(D)
+//
+// transform it into
+//
+//   D -> O: o_i = f(D)
+//
+// by subtracting "offsets" and by dividing the result by "strides".
+isl::map removeRangeStrides(
+    isl::map relation,
+    isl::multi_val strides,
+    isl::multi_aff offsets) {
+  CHECK_EQ(strides.size(), offsets.size());
+
+  auto space = relation.get_space();
+  auto stridesMA = isl::multi_aff::identity(space.range().map_from_set());
+  stridesMA = stridesMA / strides;
+
+  return relation.sum(isl::map(offsets.neg())).apply_range(isl::map(stridesMA));
+}
+
+// Compute a box approximation of the range of the given relation,
+// including the lower bounds, the box sizes, and the strides.
+// If the range has strides, remove them first.
 ScopedFootprint outputRanges(isl::map access) {
+  auto ctx = access.get_ctx();
+  int nSubscripts = access.dim(isl::dim_type::out);
+
+  auto strides = isl::val_list(ctx, nSubscripts);
+  auto strideOffsets = isl::aff_list(ctx, nSubscripts);
+  for (int i = 0; i < nSubscripts; ++i) {
+    auto si = access.get_range_stride_info(i);
+    strides = strides.add(si.get_stride());
+    strideOffsets = strideOffsets.add(si.get_offset());
+  }
+
   ScopedFootprint footprint;
+  footprint.strideValues = isl::multi_val(access.get_space().range(), strides);
+  footprint.strideOffsets = isl::multi_aff(access.get_space(), strideOffsets);

-  // TODO: also compute strides
+  access = removeRangeStrides(
+      access, footprint.strideValues, footprint.strideOffsets);

   footprint.box = access.get_range_simple_fixed_box_hull();
   return footprint;
@@ -84,10 +124,16 @@ isl::set TensorReferenceGroup::approximateFootprint() const {
   auto lspace = isl::local_space(accessed.get_space().range());

   for (size_t i = 0; i < approximation.dim(); ++i) {
-    auto dimLowerBound = approximation.lowerBound(i);
+    auto offset = approximation.lowerBound(i);
+    auto stride = approximation.stride(i);
+    auto strideOffset = approximation.strideOffset(i);
+    auto size = approximation.size(i);
     auto rhs = isl::aff(lspace, isl::dim_type::set, i);
-    isl::map partial = (isl::aff_map(dimLowerBound) <= rhs) &
-        (isl::aff_map(dimLowerBound + approximation.size(i)) > rhs);
+    auto lowerBound = offset * stride + strideOffset;
+    auto upperBound = (offset + size) * stride + strideOffset;
+    auto partial =
+        (isl::aff_map(lowerBound) <= rhs) & (isl::aff_map(upperBound) > rhs);
+
     accessed = accessed & partial;
   }
   return accessed.range();
@@ -304,7 +350,9 @@ TensorGroups TensorReferenceGroup::accessedBySubtree(

 // Compute the relation between schedule dimensions, original and promoted array
 // subscripts, in the space
-//   [S -> O] -> P
+//   [S -> O] -> O.
+// The caller is in charge of updating the tuple of the target space with the
+// group identifier.
 // The mapping depends on the original schedule dimensions because the same
 // elements of the promoted array get assigned different values of the original
 // array in different outer loop iterations; it's impossible to project out the
@@ -314,10 +362,20 @@ isl::multi_aff TensorReferenceGroup::promotion() const {
   isl::map map = scopedAccesses();
   auto accessSpace = map.get_space();

-  // lower bounds space is S -> P; which we transform into [S -> O] -> P
-  auto lowerBounds = approximation.lowerBounds().pullback(
-      isl::multi_aff::domain_map(accessSpace));
-  auto promotion = isl::multi_aff::range_map(accessSpace) - lowerBounds;
+  // Construct a projection multi-aff in [S -> O] -> S
+  // for further precomposition.
+  auto originalSpaceInserter = isl::multi_aff::domain_map(accessSpace);
+
+  // Lower bounds and offsets space is S -> O; transform into [S -> O] -> O.
+  auto lowerBounds =
+      approximation.lowerBounds().pullback(originalSpaceInserter);
+  auto offsets = approximation.strideOffsets.pullback(originalSpaceInserter);
+
+  // Create promotion starting by identity in [S -> O] -> O.
+  auto original = isl::multi_aff::range_map(accessSpace);
+  auto promotion =
+      (original - offsets) / approximation.strideValues - lowerBounds;
+
   return promotion;
 }

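To make the arithmetic of removeRangeStrides concrete, the following hedged trace applies its two steps to a single strided relation. It is an illustration only, not part of the commit, under the same binding assumptions as the sketch above.

#include <isl/cpp.h>  // assumed header for the isl C++ bindings

void removeRangeStridesTrace() {
  isl::ctx ctx(isl_ctx_alloc());
  // D -> O: o = 1 + 2 * f(D) with f(D) = i, so offsets = (1), strides = (2).
  isl::map relation(ctx, "{ S[i] -> A[o] : o = 2 * i + 1 and 0 <= i < 32 }");
  // Step 1, relation.sum(isl::map(offsets.neg())), subtracts the offset:
  //   { S[i] -> A[o] : o = 2 * i }
  // Step 2, apply_range with identity / strides, divides by the stride:
  //   { S[i] -> A[o] : o = i }
  isl::map destrided(ctx, "{ S[i] -> A[o] : o = i and 0 <= i < 32 }");
  // The destrided range is dense, so get_range_simple_fixed_box_hull later
  // computes a box of size 32 rather than 63.
  (void)relation;
  (void)destrided;
}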
tc/core/polyhedral/memory_promotion.h

Lines changed: 14 additions & 2 deletions
@@ -33,8 +33,10 @@ enum class AccessType : short { Read, Write };
 // Rectangular overapproximation of a tensor elements accessed through a single
 // reference.
 // Each dimension is overapproximated by a lower bound, an affine function of
-// parameters and schedule dimensions visible around the scope, and by a
-// constant size.
+// parameters and schedule dimensions visible around the scope, by a
+// constant size, and by a pair offset/stride for strided accesses. If the
+// access is not strided, then "offset" is a zero expression and "stride" is 1.
+// The lowerBound and the size are computed after removing the potential stride.
 // The scope is defined by a specific position in a schedule tree (const
 // ScheduleTree*), the user is responsible for maintaining the correspondance
 // between schedule tree positions and footprints.
@@ -48,7 +50,17 @@ struct ScopedFootprint {
   isl::aff lowerBound(size_t pos) const {
     return box.get_offset().get_aff(pos);
   }
+  isl::val stride(size_t pos) const {
+    return strideValues.get_val(pos);
+  }
+  isl::aff strideOffset(size_t pos) const {
+    return strideOffsets.get_aff(pos);
+  }
+
   isl::fixed_box box;
+  isl::multi_val strideValues;
+  isl::multi_aff strideOffsets;
+
   isl::multi_aff lowerBounds() const;
 };

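The stride fields added here compose with the box as follows; the sketch below restates the index arithmetic from promotion() and approximateFootprint() in the .cc file above, with plain integers standing in for isl affine expressions (illustration only, not part of the commit):

// Per dimension: promoted subscript p and original subscript o are related by
//   p = (o - strideOffset) / stride - lowerBound    (promotion)
//   o = stride * (p + lowerBound) + strideOffset    (footprint bounds)
// For a non-strided access, stride == 1 and strideOffset == 0, recovering the
// previous behavior p = o - lowerBound.
long promote(long o, long stride, long strideOffset, long lowerBound) {
  return (o - strideOffset) / stride - lowerBound;
}
long unpromote(long p, long stride, long strideOffset, long lowerBound) {
  return stride * (p + lowerBound) + strideOffset;
}
// Round trip with stride 2, offset 1, lower bound 0:
//   promote(7, 2, 1, 0) == 3 and unpromote(3, 2, 1, 0) == 7.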
test/test_cuda_mapper_memory_promotion.cc

Lines changed: 108 additions & 0 deletions
@@ -504,6 +504,114 @@ TEST_F(MatMulBias, RegisterPromotionSharedPreference) {
       << "tensor A promoted to register but has elements accessed by multiple threads";
 }

+class Strided : public TestMapper {
+ public:
+  std::unique_ptr<MappedScop> makeScopAndCheck(
+      const std::string& tc,
+      const std::unordered_map<std::string, size_t>& sizes,
+      long groupISize,
+      long groupIStride,
+      long groupIConstOffset) {
+    auto options = CudaMappingOptions::makeNaiveMappingOptions()
+                       .tile(32, 32)
+                       .mapToThreads(21, 21)
+                       .useSharedMemory(false);
+    auto mscop = makeMappedScop(tc, options, sizes);
+    auto& scop = mscop->scop();
+    auto ctx = scop.domain().get_ctx();
+
+    auto groups = TensorReferenceGroup::accessedBySubtree(
+        scop.scheduleRoot()->child({0, 0, 0}), scop);
+    EXPECT_EQ(groups.size(), 2u) << "expected groups for both tensors";
+
+    for (const auto& g : groups) {
+      auto name = g.first.get_name();
+      if (name != "I") {
+        continue;
+      }
+
+      const auto& perTensorGroups = g.second;
+      // One cannot use ASSERT_EQ in a function that returns something, because
+      // it would trigger an immediate return without value. Use EXPECT_EQ and
+      // return nullptr manually.
+      EXPECT_EQ(perTensorGroups.size(), 1u) << "expected one group for I";
+      if (perTensorGroups.size() != 1u) {
+        return nullptr;
+      }
+      const auto& oneGroup = perTensorGroups[0];
+
+      EXPECT_EQ(oneGroup->references.size(), 1u)
+          << "expected one reference in the group for I";
+      if (oneGroup->references.size() != 1u) {
+        return nullptr;
+      }
+      const auto& ref = oneGroup->references[0];
+
+      EXPECT_EQ(oneGroup->approximation.dim(), 2u)
+          << "could not compute approximation for " << ref->scopedAccess;
+
+      EXPECT_EQ(oneGroup->approximation.size(1), isl::val(ctx, groupISize))
+          << "expected strides to be removed";
+
+      isl::val stride = isl::val(ctx, groupIStride);
+      EXPECT_EQ(oneGroup->approximation.stride(1), stride);
+
+      auto expectedOffset =
+          isl::aff::zero_on_domain(ref->scopedAccess.domain().get_space()) +
+          groupIConstOffset;
+      // Convert to pw_aff because it has is_equal whereas a simple aff only has
+      // is_plain_equal that fails here.
+      EXPECT_TRUE(
+          isl::pw_aff(oneGroup->approximation.strideOffset(1).mod(stride))
+              .is_equal(isl::pw_aff(expectedOffset.mod(stride))))
+          << oneGroup->approximation.strideOffset(1) << "\n"
+          << expectedOffset;
+    }
+    return mscop;
+  }
+};
+
+// Check that strides are effectively handled in memory promotion. In
+// particular, check that array elements that are jumped over
+// by the main computation are not copied into shared memory.
+TEST_F(Strided, Stride2) {
+  std::string tc = R"TC(
+def strided(float(N,M) I) -> (O) {
+  O(i, j) = I(j, 2 * i + 1)
+}
+)TC";
+
+  // Expect the promoted size to be 32x32, with stride 2 and offset -1 along
+  // the second dimension.
+  auto mscop = makeScopAndCheck(tc, {{"N", 42}, {"M", 420}}, 32, 2, -1);
+  ASSERT_TRUE(mscop.get() != nullptr);
+  auto& scop = mscop->scop();
+
+  // Additionally check that copies look fine.
+  scop.promoteEverythingAt({0, 0, 0});
+  auto code = std::get<0>(mscop->codegen("strided"));
+  EXPECT_TRUE(
+      code.find("_I_0[c2][c3] = I[32 * b1 + c2][64 * b0 + 2 * c3 + 1]") !=
+      std::string::npos)
+      << "expected strided accesses to global array in copies";
+  EXPECT_TRUE(code.find("= _I_0[c3][c2]") != std::string::npos)
+      << "expected non-strided access to promoted array in main computation";
+  EXPECT_TRUE(code.find("= _I_0[c3][2 * c2") == std::string::npos)
+      << "did not expect strided access to promoted array in main computation";
+}
+
+TEST_F(Strided, Stride5) {
+  std::string tc = R"TC(
+def strided(float(N,M) I) -> (O) {
+  O(i, j) = I(j, 5 * i)
+}
+)TC";
+
+  // Expect the promoted size to be 32x32, with stride 5 and offset 0 along
+  // the second dimension.
+  makeScopAndCheck(tc, {{"N", 42}, {"M", 420}}, 32, 5, 0);
+}
+
 int main(int argc, char** argv) {
   ::testing::InitGoogleTest(&argc, &argv);
   ::gflags::ParseCommandLineFlags(&argc, &argv, true);
