@@ -50,6 +50,8 @@ class TcOp : public Operator<Context> {
50
50
gradTcName_ =
51
51
OperatorBase::GetSingleArgument<std::string>(" tcGradName" , " ERROR" );
52
52
profile_ = OperatorBase::GetSingleArgument<bool >(" profile" , false );
53
+ compiled_ = false ;
54
+ handle_ = 0 ;
53
55
ArgumentHelper args (operator_def);
54
56
if (args.HasArgument (" mappingOptions" )) {
55
57
cudaMappingOptions_ = tc::CudaMappingOptions (
@@ -95,38 +97,38 @@ class TcOp : public Operator<Context> {
95
97
}
96
98
97
99
virtual bool RunOnDevice () override {
98
- // first, given the TC, define it in the executionEngine_
99
- executionEngine_->define (tc_);
100
-
101
- // now, given the input tensors, convert them to dlpack tensors so that
102
- // we can call the compile command
103
- std::vector<::tc::dlutils::DLTensorUPtr> inTensorUPtrs;
104
- std::vector<const DLTensor*> inputDLTensors;
105
- for (int idx = 0 ; idx < this ->InputSize (); ++idx) {
106
- auto dims = this ->Input (idx).dims ();
107
- inTensorUPtrs.emplace_back (
108
- dlpack::makeConstDLTensor (this ->Input (idx), dims));
109
- inputDLTensors.push_back (inTensorUPtrs.back ().get ());
100
+ if (!compiled_) {
101
+ // first, given the TC, define it in the executionEngine_
102
+ executionEngine_->define (tc_);
103
+ for (int idx = 0 ; idx < this ->InputSize (); ++idx) {
104
+ auto dims = this ->Input (idx).dims ();
105
+ inTensorUPtrs_.emplace_back (
106
+ dlpack::makeConstDLTensor (this ->Input (idx), dims));
107
+ inputDLTensors_.push_back (inTensorUPtrs_[idx].get ());
108
+ inputVoidPtrs_.push_back (inputDLTensors_[idx]->data );
109
+ }
110
+ auto outTensorInfo =
111
+ executionEngine_->inferOutputTensorInfo (tcName_, inputDLTensors_);
112
+ prepareOutputs (outTensorInfo);
113
+ for (int idx = 0 ; idx < OutputSize (); ++idx) {
114
+ outTensorUPtrs_.emplace_back (dlpack::makeDLTensor (Output (idx)));
115
+ outputDLTensors_.push_back (outTensorUPtrs_[idx].get ());
116
+ outputVoidPtrs_.push_back (outputDLTensors_[idx]->data );
117
+ }
118
+ handle_ = executionEngine_->compile (
119
+ tcName_,
120
+ inputDLTensors_,
121
+ cudaMappingOptions_.toProtobufSerializedString ());
122
+ compiled_ = true ;
110
123
}
111
124
112
- auto outTensorInfo =
113
- executionEngine_->inferOutputTensorInfo (tcName_, inputDLTensors);
114
- prepareOutputs (outTensorInfo);
115
-
116
- // now create the outputDLTensors
117
- std::vector<::tc::dlutils::DLTensorUPtr> outTensorUPtrs;
118
- std::vector<DLTensor*> outputDLTensors;
119
- for (int i = 0 ; i < OutputSize (); ++i) {
120
- outTensorUPtrs.emplace_back (dlpack::makeDLTensor (Output (i)));
121
- outputDLTensors.push_back (outTensorUPtrs.back ().get ());
125
+ if (profile_) {
126
+ executionEngine_->run (
127
+ handle_, inputDLTensors_, outputDLTensors_, profile_);
128
+ } else {
129
+ executionEngine_->uncheckedRun (handle_, inputVoidPtrs_, outputVoidPtrs_);
122
130
}
123
131
124
- // compile and run
125
- auto handle = executionEngine_->compile (
126
- tcName_,
127
- inputDLTensors,
128
- cudaMappingOptions_.toProtobufSerializedString ());
129
- executionEngine_->run (handle, inputDLTensors, outputDLTensors, profile_);
130
132
return true ;
131
133
}
132
134
@@ -136,6 +138,14 @@ class TcOp : public Operator<Context> {
136
138
  // Name of the TC entry point compiled and run by this operator.
  std::string tcName_;
  // Name of the TC used by the corresponding gradient operator
  // (read from the "tcGradName" argument in the constructor).
  std::string gradTcName_;
  // When true, RunOnDevice uses the checked/profiling execution path.
  bool profile_;
  // Set after the first RunOnDevice, once the TC has been compiled.
  bool compiled_;
  // Handle returned by executionEngine_->compile(); identifies the
  // compiled kernel on subsequent run()/uncheckedRun() calls.
  size_t handle_;
  // Raw data pointers cached for the unchecked fast-path run.
  std::vector<const void *> inputVoidPtrs_;
  std::vector<void *> outputVoidPtrs_;
  // Non-owning views over the DLPack wrappers below.
  std::vector<const DLTensor*> inputDLTensors_;
  std::vector<DLTensor*> outputDLTensors_;
  // Owning DLPack wrappers built from the Caffe2 input/output tensors.
  std::vector<::tc::dlutils::DLTensorUPtr> inTensorUPtrs_;
  std::vector<::tc::dlutils::DLTensorUPtr> outTensorUPtrs_;
  // CUDA mapping options for the forward TC and its gradient TC.
  tc::CudaMappingOptions cudaMappingOptions_;
  tc::CudaMappingOptions gradCudaMappingOptions_;
141
151
0 commit comments