Merge pull request #248 from nicolasvasilache/pr/cuda-input-const-prt

nicolasvasilache · web-flow · commit f418bfa55cdf · 2018-04-03T15:55:11.000+02:00
Use const pointers for input tensors in generated CUDA code
diff --git a/include/tc/lang/sema.h b/include/tc/lang/sema.h
@@ -155,6 +155,7 @@ static inline TreeRef match_types(TreeRef a, TreeRef b) {
 /// - replace TK_APPLY with TK_BUILT_IN for built in functions
 /// - checks that all variables are defined, and creates index/reduction
 /// variable objects.
+/// - checks that input variables are readonly.
 struct Sema {
   std::unordered_map<TreeRef, TreeRef> expr_to_type;
 
@@ -349,10 +350,13 @@ struct Sema {
       }
     }
 
-    for (auto p : func.params())
+    for (auto p : func.params()) {
       nonTemporaries.insert(p.ident().name());
-    for (auto r : func.returns())
+      inputParameters.insert(p.ident().name());
+    }
+    for (auto r : func.returns()) {
       nonTemporaries.insert(r.ident().name());
+    }
 
     auto statements_ =
         checkList(func.statements(), [&](TreeRef r) { return checkStmt(r); });
@@ -445,6 +449,9 @@ struct Sema {
 
     // make dimension variables for each dimension of the output tensor
     std::string name = stmt.ident().name();
+    if (inputParameters.count(name) > 0) {
+      throw ErrorReport(stmt_) << "TC inputs are immutable";
+    }
     TreeList output_indices;
     int n = stmt.indices().size();
     for (int i = 0; i < n; ++i) {
@@ -614,6 +621,7 @@ struct Sema {
   // allowed
   std::unordered_set<std::string> live_input_names;
 
+  std::unordered_set<std::string> inputParameters;
   std::unordered_set<std::string> nonTemporaries;
 };
 } // namespace lang
diff --git a/src/core/polyhedral/cuda/codegen.cc b/src/core/polyhedral/cuda/codegen.cc
@@ -108,9 +108,12 @@ vector<string> emitParams(const Scop& scop) {
 }
 
 // Returns number of names printed, i.e. tensors.size().
-string emitTypedTensorName(Halide::OutputImageParam t) {
+string emitTypedTensorName(
+    Halide::OutputImageParam t,
+    bool constInput = false) {
   stringstream ss;
-  ss << t.type() << "* " << makePointerName(t.name());
+  ss << (constInput ? "const " : "") << t.type() << "* "
+     << makePointerName(t.name());
   return ss.str();
 }
 
@@ -128,7 +131,7 @@ vector<string> emitTypedTensorNames(const vector<Halide::ImageParam>& tensors) {
   vector<string> res;
   res.reserve(tensors.size());
   for (auto t : tensors) {
-    res.push_back(emitTypedTensorName(t));
+    res.push_back(emitTypedTensorName(t, true));
   }
   return res;
 }
@@ -179,7 +182,8 @@ void emitKernelSignature(
 void emitTensorView(
     stringstream& ss,
     Halide::OutputImageParam p,
-    const map<string, Halide::Expr>& paramValues) {
+    const map<string, Halide::Expr>& paramValues,
+    bool constInput = false) {
   WS ws;
   stringstream ssViewType;
   for (int i = 1; i < p.dimensions(); ++i) { // Skip the outermost dimension
@@ -190,9 +194,11 @@ void emitTensorView(
     ssViewType << "[" << extent << "]";
   }
   ss << ws.tab();
-  ss << p.type() << " (*" << p.name() << ")" << ssViewType.str();
+  ss << (constInput ? "const " : "") << p.type() << " (*" << p.name() << ")"
+     << ssViewType.str();
   ss << " = ";
-  ss << "reinterpret_cast<" << p.type() << " (*)" << ssViewType.str() << ">";
+  ss << "reinterpret_cast<" << (constInput ? "const " : "") << p.type()
+     << " (*)" << ssViewType.str() << ">";
   ss << "(" << makePointerName(p.name()) << ")";
   ss << ";";
   ss << endl;
@@ -212,7 +218,7 @@ void emitTensorViews(
     const vector<Halide::ImageParam>& params,
     const map<string, Halide::Expr>& paramValues) {
   for (auto p : params) {
-    emitTensorView(ss, p, paramValues);
+    emitTensorView(ss, p, paramValues, true);
   }
 }
 
diff --git a/test/test_mapper.cc b/test/test_mapper.cc
@@ -182,8 +182,8 @@ def fun(float(N, M) A, float(N, M) B) -> (C) {
       R"RES(int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
   int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
   float32 (*C)[M] = reinterpret_cast<float32 (*)[M]>(pC);
-  float32 (*A)[M] = reinterpret_cast<float32 (*)[M]>(pA);
-  float32 (*B)[M] = reinterpret_cast<float32 (*)[M]>(pB);
+  const float32 (*A)[M] = reinterpret_cast<const float32 (*)[M]>(pA);
+  const float32 (*B)[M] = reinterpret_cast<const float32 (*)[M]>(pB);
   for (int c1 = 16 * b1; c1 < M; c1 += 4096) {
     if (M >= t1 + c1 + 1) {
       C[(t0 + 16 * b0)][(t1 + c1)] = (A[(t0 + 16 * b0)][(t1 + c1)] + B[(t0 + 16 * b0)][(t1 + c1)]);
@@ -219,10 +219,10 @@ def fun(float(N, N, N, N) A, float(N, N) B, float(N, N) C, float(N, N) D)
   float32 (*O1)[N] = reinterpret_cast<float32 (*)[N]>(pO1);
   float32 (*O2)[N] = reinterpret_cast<float32 (*)[N]>(pO2);
   float32 (*O3)[N] = reinterpret_cast<float32 (*)[N]>(pO3);
-  float32 (*A)[N][N][N] = reinterpret_cast<float32 (*)[N][N][N]>(pA);
-  float32 (*B)[N] = reinterpret_cast<float32 (*)[N]>(pB);
-  float32 (*C)[N] = reinterpret_cast<float32 (*)[N]>(pC);
-  float32 (*D)[N] = reinterpret_cast<float32 (*)[N]>(pD);
+  const float32 (*A)[N][N][N] = reinterpret_cast<const float32 (*)[N][N][N]>(pA);
+  const float32 (*B)[N] = reinterpret_cast<const float32 (*)[N]>(pB);
+  const float32 (*C)[N] = reinterpret_cast<const float32 (*)[N]>(pC);
+  const float32 (*D)[N] = reinterpret_cast<const float32 (*)[N]>(pD);
   for (int c0 = 0; c0 < N; c0 += 1) {
     for (int c1 = 0; c1 < N; c1 += 1) {
       O1[c0][c1] = 0.000000f;
@@ -261,11 +261,11 @@ def fun(float(N, N) A) -> (O)
   auto res = std::get<0>(mscop->codegen(specializedName));
 
   string expected(
-      R"RES(__global__ void kernel_anon(int32 N, float32* pO, float32* pA) {
+      R"RES(__global__ void kernel_anon(int32 N, float32* pO, const float32* pA) {
   int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
   int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
   float32 (*O)[N] = reinterpret_cast<float32 (*)[N]>(pO);
-  float32 (*A)[N] = reinterpret_cast<float32 (*)[N]>(pA);
+  const float32 (*A)[N] = reinterpret_cast<const float32 (*)[N]>(pA);
   for (int c0 = 0; c0 < N; c0 += 1) {
     for (int c1 = 0; c1 < N; c1 += 1) {
       O[c0][c1] = (((A[c0][c1] + float32(c0)) + float32(c1)) + float32(N));
@@ -290,13 +290,13 @@ def fun(float(N, N) A, float(N, N) B, float(N) C) -> (O)
   auto res = std::get<0>(mscop->codegen(specializedName));
 
   string expected =
-      R"RES(__global__ void kernel_anon(int32 N, float32* pO, float32* pA, float32* pB, float32* pC) {
+      R"RES(__global__ void kernel_anon(int32 N, float32* pO, const float32* pA, const float32* pB, const float32* pC) {
   int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
   int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
   float32 (*O)[512] = reinterpret_cast<float32 (*)[512]>(pO);
-  float32 (*A)[512] = reinterpret_cast<float32 (*)[512]>(pA);
-  float32 (*B)[512] = reinterpret_cast<float32 (*)[512]>(pB);
-  float32 (*C) = reinterpret_cast<float32 (*)>(pC);
+  const float32 (*A)[512] = reinterpret_cast<const float32 (*)[512]>(pA);
+  const float32 (*B)[512] = reinterpret_cast<const float32 (*)[512]>(pB);
+  const float32 (*C) = reinterpret_cast<const float32 (*)>(pC);
   for (int c0 = 0; c0 <= 511; c0 += 1) {
     for (int c1 = 0; c1 <= 511; c1 += 1) {
       O[c0][c1] = (nextafter(C[c0], exp(A[c0][c1])) + log(B[c1][c0]));
@@ -312,8 +312,8 @@ constexpr auto kExpectedMatmul_64_64_64 =
     R"CUDA(int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
   int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
   float32 (*O)[64] = reinterpret_cast<float32 (*)[64]>(pO);
-  float32 (*A)[64] = reinterpret_cast<float32 (*)[64]>(pA);
-  float32 (*B)[64] = reinterpret_cast<float32 (*)[64]>(pB);
+  const float32 (*A)[64] = reinterpret_cast<const float32 (*)[64]>(pA);
+  const float32 (*B)[64] = reinterpret_cast<const float32 (*)[64]>(pB);
   for (int c0 = 0; c0 <= 63; c0 += 16) {
     for (int c1 = 0; c1 <= 63; c1 += 16) {
       for (int c2 = t1; c2 <= 15; c2 += 8) {
diff --git a/test/test_tc2halide.cc b/test/test_tc2halide.cc
@@ -197,6 +197,15 @@ def fun(float(M, N) I) -> (O1, O2, O3) {
   Check(tc, {123, 13});
 }
 
+TEST_F(TC2Isl, MutableInput) {
+  string tc = R"TC(
+def foo(float(N) A) -> (B) {
+    A(i) = A(i) + 42
+    B(k) +=! A(i) where k in 0:1
+}
+)TC";
+  EXPECT_THROW(Check(tc, {123}), ::lang::ErrorReport);
+}
 int main(int argc, char** argv) {
   ::testing::InitGoogleTest(&argc, argv);
   ::gflags::ParseCommandLineFlags(&argc, &argv, true);