diff --git a/tc/core/cuda/cuda_tc_executor.cc b/tc/core/cuda/cuda_tc_executor.cc
index 72a1350ad..1ebb2047b 100644
--- a/tc/core/cuda/cuda_tc_executor.cc
+++ b/tc/core/cuda/cuda_tc_executor.cc
@@ -93,13 +93,16 @@ CudaCompilationResult CudaBackend::compileWithTcMapper(
   auto parameters = mappedScop->scop().getParameterValues();
   auto specializedName = specializeKernelName(tcName, parameters);
 
+  auto inputsInfo = makeTensorInfoVector(inputs);
+
   // This updates the launch bounds with the actual result from compilation
   // with tightening of launch_bounds. What you get is not necessarily what
   // you asked for, the autotuner should adapt to that.
   std::string source;
   Grid grid;
   Block block;
-  std::tie(source, grid, block) = mappedScop->codegen(specializedName);
+  std::tie(source, grid, block) =
+      mappedScop->codegen(specializedName, inputsInfo);
   LOG_IF(INFO, FLAGS_dump_cuda) << "generatedCuda: " << source << "\n"
                                 << "grid: " << grid << " block: " << block;
 
diff --git a/tc/core/polyhedral/cuda/codegen.cc b/tc/core/polyhedral/cuda/codegen.cc
index ee1643984..71a9afe3b 100644
--- a/tc/core/polyhedral/cuda/codegen.cc
+++ b/tc/core/polyhedral/cuda/codegen.cc
@@ -183,7 +183,8 @@ void emitTensorView(
     stringstream& ss,
     Halide::OutputImageParam p,
     const map<string, Halide::Expr>& paramValues,
-    bool constInput = false) {
+    bool constInput = false,
+    const TensorInfo* tinfo = NULL) {
   WS ws;
   stringstream ssViewType;
   for (int i = 1; i < p.dimensions(); ++i) { // Skip the outermost dimension
@@ -191,7 +192,14 @@ void emitTensorView(
     extent = Halide::Internal::substitute(paramValues, extent);
     CHECK(extent.defined())
         << "Undefined extent on input/output tensor. Forward bounds inference should have set these\n";
-    ssViewType << "[" << extent << "]";
+    // TODO: Handle non-unit stride in the innermost dimension
+    if (tinfo && tinfo->strides.size() == p.dimensions() &&
+        tinfo->strides[p.dimensions() - 1] == 1 &&
+        tinfo->strides[i - 1] != (tinfo->shape[i] * tinfo->strides[i])) {
+      ssViewType << "[" << tinfo->strides[i - 1] << "]";
+    } else {
+      ssViewType << "[" << extent << "]";
+    }
   }
   ss << ws.tab();
   ss << (constInput ? "const " : "") << p.type() << " (*" << p.name() << ")"
@@ -216,9 +224,12 @@ void emitTensorViews(
 void emitTensorViews(
     stringstream& ss,
     const vector<Halide::ImageParam>& params,
-    const map<string, Halide::Expr>& paramValues) {
-  for (auto p : params) {
-    emitTensorView(ss, p, paramValues, true);
+    const map<string, Halide::Expr>& paramValues,
+    const std::vector<TensorInfo>& inputsInfo) {
+  for (size_t i = 0; i < params.size(); ++i) {
+    inputsInfo.size()
+        ? emitTensorView(ss, params[i], paramValues, true, &inputsInfo[i])
+        : emitTensorView(ss, params[i], paramValues, true);
   }
 }
 
@@ -738,7 +749,8 @@ std::unordered_set<isl::id, isl::IslIdIslHash> gatherReadOnlySet(
 
 string emitCudaKernel(
     const std::string& specializedName,
-    const MappedScop& mscop) {
+    const MappedScop& mscop,
+    const std::vector<TensorInfo>& inputsInfo) {
   // Expecting a schedule with domain root and context first child.
   CHECK(mscop.schedule()->elemAs<detail::ScheduleTreeElemDomain>());
   CHECK(
@@ -755,7 +767,7 @@ string emitCudaKernel(
   emitKernelSignature(ss, specializedName, scop);
   emitThreadIdInit(ss, mscop);
   emitTensorViews(ss, scop.halide.outputs, paramValues);
-  emitTensorViews(ss, scop.halide.inputs, paramValues);
+  emitTensorViews(ss, scop.halide.inputs, paramValues, inputsInfo);
   emitTmpDecl(ss, scop);
   emitPromotedArrayViewsHalide(ss, scop);
   NodeInfoMapType nodeInfoMap;
diff --git a/tc/core/polyhedral/cuda/codegen.h b/tc/core/polyhedral/cuda/codegen.h
index ff3631d92..e4b11595c 100644
--- a/tc/core/polyhedral/cuda/codegen.h
+++ b/tc/core/polyhedral/cuda/codegen.h
@@ -145,7 +145,8 @@ struct CodegenStatementContext : CodegenContext {
 
 std::string emitCudaKernel(
     const std::string& specializedName,
-    const MappedScop& scop);
+    const MappedScop& scop,
+    const std::vector<TensorInfo>& inputsInfo);
 
 } // namespace polyhedral
 } // namespace tc
diff --git a/tc/core/polyhedral/cuda/mapped_scop.cc b/tc/core/polyhedral/cuda/mapped_scop.cc
index e0dc474ae..1efb03c0b 100644
--- a/tc/core/polyhedral/cuda/mapped_scop.cc
+++ b/tc/core/polyhedral/cuda/mapped_scop.cc
@@ -910,7 +910,8 @@ std::unique_ptr<MappedScop> makeSpecializedMappedScop(
 // the context of the original scop as top-level
 // context node in schedule tree.
 std::tuple<std::string, tc::Grid, tc::Block> MappedScop::codegen(
-    const std::string& specializedName) const {
+    const std::string& specializedName,
+    const std::vector<TensorInfo>& inputsInfo) const {
   validate(schedule());
 
   auto mappedScopForCodegen = makeSpecializedMappedScop(*this);
@@ -927,8 +928,8 @@ std::tuple<std::string, tc::Grid, tc::Block> MappedScop::codegen(
     code << code::cuda::cubBlockReduce;
   }
   code << "extern \"C\" {" << std::endl
-       << emitCudaKernel(specializedName, *mappedScopForCodegen) << "}"
-       << std::endl;
+       << emitCudaKernel(specializedName, *mappedScopForCodegen, inputsInfo)
+       << "}" << std::endl;
 
   return std::make_tuple(
       code.str(),
diff --git a/tc/core/polyhedral/cuda/mapped_scop.h b/tc/core/polyhedral/cuda/mapped_scop.h
index 169b4f138..5af792df9 100644
--- a/tc/core/polyhedral/cuda/mapped_scop.h
+++ b/tc/core/polyhedral/cuda/mapped_scop.h
@@ -115,7 +115,9 @@ class MappedScop {
   // Generate CUDA code at the current state of transformation provided a
   // name for the generated function.
   std::tuple<std::string, tc::Grid, tc::Block> codegen(
-      const std::string& specializedName) const;
+      const std::string& specializedName,
+      const std::vector<TensorInfo>& inputsInfo =
+          std::vector<TensorInfo>{}) const;
 
   // Accessors..
   // Const accessor to schedule of underlying Scop.
diff --git a/test/cuda/test_tc_mapper.cc b/test/cuda/test_tc_mapper.cc
index e89756aea..3aedafa29 100644
--- a/test/cuda/test_tc_mapper.cc
+++ b/test/cuda/test_tc_mapper.cc
@@ -326,8 +326,8 @@ def tensoraddstrided(float(N, M) I0_view, float(N, M) I1_view) -> (O) {
   auto res = Check(TC, name, options, inputs, checkFun);
   // This test should be modified when strided tensors are handled
   std::string expected =
-      "const float32 (*I0_view)[64] = "
-      "reinterpret_cast<const float32 (*)[64]>(pI0_view)";
+      "const float32 (*I0_view)[128] = "
+      "reinterpret_cast<const float32 (*)[128]>(pI0_view)";
   ASSERT_NE(std::string::npos, res.second.find(expected))
       << "In resulting code:\n"
      << res.second << "\nfound unexpected: " << expected;
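
For reference, here is a minimal standalone sketch of the view-extent selection this diff adds to `emitTensorView`: when the caller-supplied `TensorInfo` describes a tensor with unit innermost stride whose outer stride differs from the product of the inner extent and stride, the generated array-of-arrays view uses the stride as the bracketed size instead of the Halide-inferred extent. The `SimpleTensorInfo` struct and `viewExtents` helper below are illustrative stand-ins (not the TC API), and the shape/stride values are made up to mirror the `tensoraddstrided` test.

```cpp
// Standalone sketch (not TC code) of the stride-aware extent selection.
#include <cstdint>
#include <iostream>
#include <vector>

struct SimpleTensorInfo {
  std::vector<int64_t> shape;    // e.g. {4, 64} for a float(4, 64) view
  std::vector<int64_t> strides;  // e.g. {128, 1} when rows are 128 elements apart
};

// Returns the bracketed sizes used to declare a view such as `float (*T)[e1][e2]...`.
// Mirrors the added condition in emitTensorView; the else branch uses the logical
// extent (here approximated by shape[i], in TC it comes from Halide bounds inference).
std::vector<int64_t> viewExtents(const SimpleTensorInfo& t) {
  std::vector<int64_t> extents;
  const size_t ndim = t.shape.size();
  for (size_t i = 1; i < ndim; ++i) {  // skip the outermost dimension
    bool unitInnermost = t.strides.size() == ndim && t.strides[ndim - 1] == 1;
    bool strided =
        unitInnermost && t.strides[i - 1] != t.shape[i] * t.strides[i];
    // Strided case: the row pitch comes from the stride, not the extent.
    extents.push_back(strided ? t.strides[i - 1] : t.shape[i]);
  }
  return extents;
}

int main() {
  SimpleTensorInfo contiguous{{4, 64}, {64, 1}};
  SimpleTensorInfo stridedView{{4, 64}, {128, 1}};
  std::cout << viewExtents(contiguous)[0] << "\n";   // 64  -> (*I0_view)[64]
  std::cout << viewExtents(stridedView)[0] << "\n";  // 128 -> (*I0_view)[128]
}
```

This is also why the expected string in `test_tc_mapper.cc` changes from `(*I0_view)[64]` to `(*I0_view)[128]`: for a strided input view, the row pitch (128 elements), not the logical extent (64), has to appear in the pointer type for the emitted 2D indexing to address the right memory.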