Add support for strided tensors

Protonu Basu · Protonu Basu · commit 9cd8fc9a1362 · 2018-06-08T07:56:05.000-07:00
This commit begins adding support for strided tensors. A vector of
TensorInfo is now percolated down to emitCudaKernel so that codegen can
emit the correct cast for strided tensors. This required changing a unit
test to expect the correct cast.
diff --git a/tc/core/cuda/cuda_tc_executor.cc b/tc/core/cuda/cuda_tc_executor.cc
@@ -93,13 +93,16 @@ CudaCompilationResult CudaBackend::compileWithTcMapper(
   auto parameters = mappedScop->scop().getParameterValues();
   auto specializedName = specializeKernelName(tcName, parameters);
 
+  auto inputsInfo = makeTensorInfoVector(inputs);
+
   // This updates the launch bounds with the actual result from compilation
   // with tightening of launch_bounds. What you get is not necessarily what
   // you asked for, the autotuner should adapt to that.
   std::string source;
   Grid grid;
   Block block;
-  std::tie(source, grid, block) = mappedScop->codegen(specializedName);
+  std::tie(source, grid, block) =
+      mappedScop->codegen(specializedName, inputsInfo);
   LOG_IF(INFO, FLAGS_dump_cuda) << "generatedCuda: " << source << "\n"
                                 << "grid: " << grid << " block: " << block;
 
diff --git a/tc/core/polyhedral/cuda/codegen.cc b/tc/core/polyhedral/cuda/codegen.cc
@@ -183,15 +183,23 @@ void emitTensorView(
     stringstream& ss,
     Halide::OutputImageParam p,
     const map<string, Halide::Expr>& paramValues,
-    bool constInput = false) {
+    bool constInput = false,
+    const TensorInfo* tinfo = NULL) {
   WS ws;
   stringstream ssViewType;
   for (int i = 1; i < p.dimensions(); ++i) { // Skip the outermost dimension
     Halide::Expr extent = p.parameter().extent_constraint(i);
     extent = Halide::Internal::substitute(paramValues, extent);
     CHECK(extent.defined())
         << "Undefined extent on input/output tensor. Forward bounds inference should have set these\n";
-    ssViewType << "[" << extent << "]";
+    // TODO: Handle non-unit stride in the innermost dimension
+    if (tinfo && tinfo->strides.size() == p.dimensions() &&
+        tinfo->strides[p.dimensions() - 1] == 1 &&
+        tinfo->strides[i - 1] != (tinfo->shape[i] * tinfo->strides[i])) {
+      ssViewType << "[" << tinfo->strides[i - 1] << "]";
+    } else {
+      ssViewType << "[" << extent << "]";
+    }
   }
   ss << ws.tab();
   ss << (constInput ? "const " : "") << p.type() << " (*" << p.name() << ")"
@@ -216,9 +224,12 @@ void emitTensorViews(
 void emitTensorViews(
     stringstream& ss,
     const vector<Halide::ImageParam>& params,
-    const map<string, Halide::Expr>& paramValues) {
-  for (auto p : params) {
-    emitTensorView(ss, p, paramValues, true);
+    const map<string, Halide::Expr>& paramValues,
+    const std::vector<TensorInfo>& inputsInfo) {
+  for (size_t i = 0; i < params.size(); ++i) {
+    inputsInfo.size()
+        ? emitTensorView(ss, params[i], paramValues, true, &inputsInfo[i])
+        : emitTensorView(ss, params[i], paramValues, true);
   }
 }
 
@@ -738,7 +749,8 @@ std::unordered_set<isl::id, isl::IslIdIslHash> gatherReadOnlySet(
 
 string emitCudaKernel(
     const std::string& specializedName,
-    const MappedScop& mscop) {
+    const MappedScop& mscop,
+    const std::vector<TensorInfo>& inputsInfo) {
   // Expecting a schedule with domain root and context first child.
   CHECK(mscop.schedule()->elemAs<detail::ScheduleTreeElemDomain>());
   CHECK(
@@ -755,7 +767,7 @@ string emitCudaKernel(
   emitKernelSignature(ss, specializedName, scop);
   emitThreadIdInit(ss, mscop);
   emitTensorViews(ss, scop.halide.outputs, paramValues);
-  emitTensorViews(ss, scop.halide.inputs, paramValues);
+  emitTensorViews(ss, scop.halide.inputs, paramValues, inputsInfo);
   emitTmpDecl(ss, scop);
   emitPromotedArrayViewsHalide(ss, scop);
   NodeInfoMapType nodeInfoMap;
diff --git a/tc/core/polyhedral/cuda/codegen.h b/tc/core/polyhedral/cuda/codegen.h
@@ -145,7 +145,8 @@ struct CodegenStatementContext : CodegenContext {
 
 std::string emitCudaKernel(
     const std::string& specializedName,
-    const MappedScop& scop);
+    const MappedScop& scop,
+    const std::vector<TensorInfo>& inputsInfo);
 
 } // namespace polyhedral
 } // namespace tc
diff --git a/tc/core/polyhedral/cuda/mapped_scop.cc b/tc/core/polyhedral/cuda/mapped_scop.cc
@@ -910,7 +910,8 @@ std::unique_ptr<MappedScop> makeSpecializedMappedScop(
 // the context of the original scop as top-level
 // context node in schedule tree.
 std::tuple<std::string, tc::Grid, tc::Block> MappedScop::codegen(
-    const std::string& specializedName) const {
+    const std::string& specializedName,
+    const std::vector<TensorInfo>& inputsInfo) const {
   validate(schedule());
 
   auto mappedScopForCodegen = makeSpecializedMappedScop(*this);
@@ -927,8 +928,8 @@ std::tuple<std::string, tc::Grid, tc::Block> MappedScop::codegen(
     code << code::cuda::cubBlockReduce;
   }
   code << "extern \"C\" {" << std::endl
-       << emitCudaKernel(specializedName, *mappedScopForCodegen) << "}"
-       << std::endl;
+       << emitCudaKernel(specializedName, *mappedScopForCodegen, inputsInfo)
+       << "}" << std::endl;
 
   return std::make_tuple(
       code.str(),
diff --git a/tc/core/polyhedral/cuda/mapped_scop.h b/tc/core/polyhedral/cuda/mapped_scop.h
@@ -115,7 +115,9 @@ class MappedScop {
   // Generate CUDA code at the current state of transformation provided a
   // name for the generated function.
   std::tuple<std::string, tc::Grid, tc::Block> codegen(
-      const std::string& specializedName) const;
+      const std::string& specializedName,
+      const std::vector<TensorInfo>& inputsInfo =
+          std::vector<TensorInfo>{}) const;
 
   // Accessors..
   // Const accessor to schedule of underlying Scop.
diff --git a/test/cuda/test_tc_mapper.cc b/test/cuda/test_tc_mapper.cc
@@ -326,8 +326,8 @@ def tensoraddstrided(float(N, M) I0_view, float(N, M) I1_view) -> (O) {
   auto res = Check(TC, name, options, inputs, checkFun);
   // This test should be modified  when strided tensors are handled
   std::string expected =
-      "const float32 (*I0_view)[64] = "
-      "reinterpret_cast<const float32 (*)[64]>(pI0_view)";
+      "const float32 (*I0_view)[128] = "
+      "reinterpret_cast<const float32 (*)[128]>(pI0_view)";
   ASSERT_NE(std::string::npos, res.second.find(expected))
       << "In resulting code:\n"
       << res.second << "\nfound unexpected: " << expected;