Merge pull request #429 from mingzhe09088/add-param

nicolasvasilache · web-flow · commit 87caef11ff7c · 2018-06-01T15:20:02.000-04:00
Add argument RuntimeInformation to uncheckedRun and run, so CUDA stre…
diff --git a/tc/c2/tc_op.h b/tc/c2/tc_op.h
@@ -119,11 +119,15 @@ class TcOp : public Operator<Context> {
       compiled_ = true;
     }
 
+    // Get CUDA stream id from C2
+    tc::CudaBackend::RuntimeInformation info;
+    info.stream = context_.cuda_stream();
+
     // run
     if (!check_sizes_) {
-      executor_->uncheckedRun(input_void_ptrs_, output_void_ptrs_);
+      executor_->uncheckedRun(input_void_ptrs_, output_void_ptrs_, info);
     } else {
-      executor_->run(raw_input_dl_tensors_, raw_output_dl_tensors_);
+      executor_->run(raw_input_dl_tensors_, raw_output_dl_tensors_, info);
     }
     return true;
   }
diff --git a/tc/core/cpu/cpu_tc_executor.cc b/tc/core/cpu/cpu_tc_executor.cc
@@ -52,7 +52,8 @@ CpuCompilationResult CpuBackend::compileWithTcMapper(
 
 void CpuTcExecutor::uncheckedRun(
     const std::vector<const void*>& inputs,
-    const std::vector<void*>& outputs) const {
+    const std::vector<void*>& outputs,
+    typename CpuBackend::RuntimeInformation info) const {
   LOG(ERROR) << "NYI: CpuTcExecutor::uncheckedRun";
 }
 
diff --git a/tc/core/cpu/cpu_tc_executor.h b/tc/core/cpu/cpu_tc_executor.h
@@ -39,7 +39,8 @@ class CpuTcExecutor : public TcExecutor<CpuBackend> {
   /// doesn't then segfault will likely occur.
   void uncheckedRun(
       const std::vector<const void*>& inputs,
-      const std::vector<void*>& outputs) const;
+      const std::vector<void*>& outputs,
+      typename CpuBackend::RuntimeInformation info) const;
 
   /// Calls uncheckedRun and profiles the cpu overhead and kernel runtime
   /// (microseconds).
diff --git a/tc/core/cuda/cuda_backend.h b/tc/core/cuda/cuda_backend.h
@@ -44,7 +44,11 @@ struct CudaCompilationResult {
  * synchronization information of a kernel.
  */
 struct CudaRuntimeInformation {
-  cudaStream_t stream{0};
+ public:
+  CudaRuntimeInformation() : stream(0) {}
+
+ public:
+  cudaStream_t stream;
 };
 
 struct CudaTcExecutor;
diff --git a/tc/core/cuda/cuda_tc_executor.cc b/tc/core/cuda/cuda_tc_executor.cc
@@ -111,16 +111,16 @@ CudaCompilationResult CudaBackend::compileWithTcMapper(
 
 void CudaTcExecutor::uncheckedRun(
     const std::vector<const void*>& inputs,
-    const std::vector<void*>& outputs) const {
+    const std::vector<void*>& outputs,
+    typename CudaBackend::RuntimeInformation info) const {
   CHECK(rtcFun_) << "No rtcFun_ attached, cannot launch";
-  cudaStream_t stream = 0;
   CHECK_NE(grid_.view[0], 0u) << "Grid dims are not set up";
   CHECK_NE(block_.view[0], 0u) << "Block dims are not set up";
   rtcFun_->Launch(
       grid_.view.extractDefaultedArray(),
       block_.view.extractDefaultedArray(),
       0,
-      stream,
+      info.stream,
       parameters_,
       outputs,
       inputs);
diff --git a/tc/core/cuda/cuda_tc_executor.h b/tc/core/cuda/cuda_tc_executor.h
@@ -37,9 +37,12 @@ class CudaTcExecutor : public TcExecutor<CudaBackend> {
   /// No tensor-related information can be checked so it is the user's
   /// responsibility to ensure that shapes and strides match. If the user
   /// doesn't then segfault will likely occur.
+  /// May need to reconsider whether to pass `info` by value or by reference.
   void uncheckedRun(
       const std::vector<const void*>& inputs,
-      const std::vector<void*>& outputs) const;
+      const std::vector<void*>& outputs,
+      typename CudaBackend::RuntimeInformation info =
+          CudaBackend::RuntimeInformation()) const;
 
   /// Calls uncheckedRun and profiles the cpu overhead and kernel runtime
   /// (microseconds).
diff --git a/tc/core/tc_executor-inl.h b/tc/core/tc_executor-inl.h
@@ -109,15 +109,16 @@ inline std::pair<std::vector<const void*>, std::vector<void*>> prepareRun(
 template <typename Backend>
 void TcExecutor<Backend>::run(
     const std::vector<const DLConstTensor*>& inputs,
-    const std::vector<const DLTensor*>& outputs) const {
+    const std::vector<const DLTensor*>& outputs,
+    typename Backend::RuntimeInformation info) const {
   std::vector<const void*> rawInputs;
   std::vector<void*> rawOutputs;
   std::tie(rawInputs, rawOutputs) = detail::prepareRun(
       inputs, outputs, inputsInfo_, outputsInfo_, halideComponents_);
 
   // Static dispatch instead of virtual functions requires this cast.
   static_cast<const typename Backend::ExecutorType&>(*this).uncheckedRun(
-      rawInputs, rawOutputs);
+      rawInputs, rawOutputs, info);
 }
 
 template <typename Backend>
diff --git a/tc/core/tc_executor.h b/tc/core/tc_executor.h
@@ -88,7 +88,8 @@ class TcExecutor {
   /// advanced aliasing) properties of the input and output tensors.
   void run(
       const std::vector<const DLConstTensor*>& inputs,
-      const std::vector<const DLTensor*>& outputs) const;
+      const std::vector<const DLTensor*>& outputs,
+      typename Backend::RuntimeInformation info = {}) const;
 
   /// Calls run and profiles the cpu overhead and kernel runtime (microseconds).
   /// \returns profiling information

Original file line number	Diff line number	Diff line change
`@@ -119,11 +119,15 @@ class TcOp : public Operator<Context> {`
`119`	`119`	`compiled_ = true;`
`120`	`120`	`}`
`121`	`121`
	`122`	`+ // Get CUDA stream id from C2`
	`123`	`+ tc::CudaBackend::RuntimeInformation info;`
	`124`	`+ info.stream = context_.cuda_stream();`
	`125`	`+`
`122`	`126`	`// run`
`123`	`127`	`if (!check_sizes_) {`
`124`		`- executor_->uncheckedRun(input_void_ptrs_, output_void_ptrs_);`
	`128`	`+ executor_->uncheckedRun(input_void_ptrs_, output_void_ptrs_, info);`
`125`	`129`	`} else {`
`126`		`- executor_->run(raw_input_dl_tensors_, raw_output_dl_tensors_);`
	`130`	`+ executor_->run(raw_input_dl_tensors_, raw_output_dl_tensors_, info);`
`127`	`131`	`}`
`128`	`132`	`return true;`
`129`	`133`	`}`
Original file line number	Diff line number	Diff line change
`@@ -52,7 +52,8 @@ CpuCompilationResult CpuBackend::compileWithTcMapper(`
`52`	`52`
`53`	`53`	`void CpuTcExecutor::uncheckedRun(`
`54`	`54`	`const std::vector<const void*>& inputs,`
`55`		`- const std::vector<void*>& outputs) const {`
	`55`	`+ const std::vector<void*>& outputs,`
	`56`	`+ typename CpuBackend::RuntimeInformation info) const {`
`56`	`57`	`LOG(ERROR) << "NYI: CpuTcExecutor::uncheckedRun";`
`57`	`58`	`}`
`58`	`59`