check that all accesses to registers have constant index expressions

Sven Verdoolaege · Sven Verdoolaege · commit 460375083946 · 2018-06-28T11:04:42.000+02:00
The index expressions of accesses to arrays of registers should
be constant since registers are not addressable.
Failing to use constant index expressions would result
in accesses to "local" memory, which is much slower than registers.
Some MatMulBias tests checked for particular patterns that should
not appear when the index expressions are constant, but they did
not check that the index expressions are actually constant.
Replace these checks with generic checks during emitCudaKernel
that explicitly check for constant index expressions and
that are applied to all accesses to registers.
diff --git a/tc/core/polyhedral/cuda/codegen.cc b/tc/core/polyhedral/cuda/codegen.cc
@@ -410,10 +410,33 @@ void emitAccess(AFF access, const CodegenStatementContext& context) {
   emitAccess(buildAccess(access, context), context);
 }
 
+// Check that "expr" is an access whose index expressions are all integer constants.
+void checkConstantAccess(isl::ast_expr expr) {
+  auto op = expr.as<isl::ast_expr_op>();
+  auto access = op.as<isl::ast_op_access>();
+  TC_CHECK(access); // "expr" is required to be an access operation
+  for (int i = 1; i < access.get_n_arg(); ++i) { // argument 0 is not checked (presumably the accessed array -- confirm)
+    auto arg = access.get_arg(i);
+    TC_CHECK(arg.as<isl::ast_expr_int>()) // each remaining argument must be an integer expression
+        << "expected constant subscript, got " << arg.to_C_str();
+  }
+}
+
+// Print an access to a(n array of) register(s), after checking with
+// checkConstantAccess that its index expressions are constant.
+void emitRegisterAccess(
+    isl::pw_multi_aff access,
+    const CodegenStatementContext& context) {
+  auto expr = buildAccess(access, context); // built once, then both checked and emitted
+  checkConstantAccess(expr); // validate before the access is emitted
+  emitAccess(expr, context);
+}
+
 // Print an access to global memory, wrapping the access in an "__ldg()"
 // call if the accessed tensor is known to be read-only.
-template <typename AFF>
-void emitGlobalAccess(AFF access, const CodegenStatementContext& context) {
+void emitGlobalAccess(
+    isl::multi_pw_aff access,
+    const CodegenStatementContext& context) {
   LdgWrapper ldgWrapper(context, access.get_tuple_id(isl::dim_type::out));
   emitAccess(access, context);
 }
@@ -641,7 +664,8 @@ void emitMappedTensorAccess(
     return;
   }
 
-  auto tensorId = context.scop().promotedDecl(promotionInfo.groupId).tensorId;
+  auto decl = context.scop().promotedDecl(promotionInfo.groupId);
+  auto tensorId = decl.tensorId;
 
   // Here and below in comments: D = domain, O = original tensor, P = promoted
   // tensor, S = partial schedule, A = AST loops;
@@ -667,7 +691,11 @@ void emitMappedTensorAccess(
   auto astToPromoted =
       isl::pw_multi_aff(promotion).pullback(astToScheduledOriginal);
 
-  emitAccess(astToPromoted, context);
+  if (decl.kind == Scop::PromotedDecl::Kind::Register) {
+    emitRegisterAccess(astToPromoted, context);
+  } else {
+    emitAccess(astToPromoted, context);
+  }
 }
 
 } // namespace detail
diff --git a/test/test_cuda_mapper_memory_promotion.cc b/test/test_cuda_mapper_memory_promotion.cc
@@ -485,19 +485,6 @@ def fun(float(N,K) A, float(K,M) B, float(N,M) C) -> (O) {
     EXPECT_TRUE(cDeclPos == std::string::npos)
         << "tensor C promoted to register but has no reuse";
   }
-
-  void expectNoSymbolicSubscript(const std::string& code) {
-    // We don't know the exact name of the iterator, but it starts with c.
-    auto oWithIteratorPos = code.find("_O_0[c");
-    auto oWithThreadPos = code.find("_O_0[t1");
-
-    EXPECT_TRUE(oWithIteratorPos == std::string::npos)
-        << "accessing local arrays with iterators in subscripts makes "
-        << "these arrays placed in local memory instead of registers";
-    EXPECT_TRUE(oWithThreadPos == std::string::npos)
-        << "expected per-thread groups to be computed, i.e. thread "
-        << "identifiers should not appear in the subscripts";
-  }
 };
 
 TEST_F(MatMulBias, RegisterPromotion) {
@@ -562,7 +549,6 @@ TEST_F(MatMulBias, RegistersAtRoot) {
       << "expected O to be promoted to registers";
 
   expectNoABCPromotion(code);
-  expectNoSymbolicSubscript(code);
 
   auto o00Pos = code.find("_O_0[0][0]");
   auto o10Pos = code.find("_O_0[1][0]");
@@ -597,7 +583,6 @@ TEST_F(MatMulBias, RegistersAtRootNotEnoughUnroll) {
       << "not expected O to be promoted to registers";
 
   expectNoABCPromotion(code);
-  expectNoSymbolicSubscript(code);
 }
 
 TEST_F(MatMulBias, RegistersBelowFirstBand) {
@@ -621,7 +606,6 @@ TEST_F(MatMulBias, RegistersBelowFirstBand) {
   EXPECT_TRUE(oDeclPos != std::string::npos)
       << "expected O to be promoted to registers";
   expectNoABCPromotion(code);
-  expectNoSymbolicSubscript(code);
 }
 
 class Strided : public TestMapper {