Skip to content
This repository was archived by the owner on Apr 28, 2023. It is now read-only.

Commit 403ce94

Browse files
Nicolas Vasilache (nicolasvasilache)
authored and committed
Remove C2 overhead
This PR introduces memoization for compilation and data pointers in tc_op.h. This is temporary, since the bigger API refactoring will change things here, but at least it will allow @salexpb to run benchmarks without ridiculous CPU overhead.
1 parent df36d16 commit 403ce94

File tree

1 file changed

+38
-28
lines changed

1 file changed

+38
-28
lines changed

tc/c2/tc_op.h

Lines changed: 38 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,8 @@ class TcOp : public Operator<Context> {
5050
gradTcName_ =
5151
OperatorBase::GetSingleArgument<std::string>("tcGradName", "ERROR");
5252
profile_ = OperatorBase::GetSingleArgument<bool>("profile", false);
53+
compiled_ = false;
54+
handle_ = 0;
5355
ArgumentHelper args(operator_def);
5456
if (args.HasArgument("mappingOptions")) {
5557
cudaMappingOptions_ = tc::CudaMappingOptions(
@@ -95,38 +97,38 @@ class TcOp : public Operator<Context> {
9597
}
9698

9799
virtual bool RunOnDevice() override {
98-
// first, given the TC, define it in the executionEngine_
99-
executionEngine_->define(tc_);
100-
101-
// now, given the input tensors, convert them to dlpack tensors so that
102-
// we can call the compile command
103-
std::vector<::tc::dlutils::DLTensorUPtr> inTensorUPtrs;
104-
std::vector<const DLTensor*> inputDLTensors;
105-
for (int idx = 0; idx < this->InputSize(); ++idx) {
106-
auto dims = this->Input(idx).dims();
107-
inTensorUPtrs.emplace_back(
108-
dlpack::makeConstDLTensor(this->Input(idx), dims));
109-
inputDLTensors.push_back(inTensorUPtrs.back().get());
100+
if (!compiled_) {
101+
// first, given the TC, define it in the executionEngine_
102+
executionEngine_->define(tc_);
103+
for (int idx = 0; idx < this->InputSize(); ++idx) {
104+
auto dims = this->Input(idx).dims();
105+
inTensorUPtrs_.emplace_back(
106+
dlpack::makeConstDLTensor(this->Input(idx), dims));
107+
inputDLTensors_.push_back(inTensorUPtrs_[idx].get());
108+
inputVoidPtrs_.push_back(inputDLTensors_[idx]->data);
109+
}
110+
auto outTensorInfo =
111+
executionEngine_->inferOutputTensorInfo(tcName_, inputDLTensors_);
112+
prepareOutputs(outTensorInfo);
113+
for (int idx = 0; idx < OutputSize(); ++idx) {
114+
outTensorUPtrs_.emplace_back(dlpack::makeDLTensor(Output(idx)));
115+
outputDLTensors_.push_back(outTensorUPtrs_[idx].get());
116+
outputVoidPtrs_.push_back(outputDLTensors_[idx]->data);
117+
}
118+
handle_ = executionEngine_->compile(
119+
tcName_,
120+
inputDLTensors_,
121+
cudaMappingOptions_.toProtobufSerializedString());
122+
compiled_ = true;
110123
}
111124

112-
auto outTensorInfo =
113-
executionEngine_->inferOutputTensorInfo(tcName_, inputDLTensors);
114-
prepareOutputs(outTensorInfo);
115-
116-
// now create the outputDLTensors
117-
std::vector<::tc::dlutils::DLTensorUPtr> outTensorUPtrs;
118-
std::vector<DLTensor*> outputDLTensors;
119-
for (int i = 0; i < OutputSize(); ++i) {
120-
outTensorUPtrs.emplace_back(dlpack::makeDLTensor(Output(i)));
121-
outputDLTensors.push_back(outTensorUPtrs.back().get());
125+
if (profile_) {
126+
executionEngine_->run(
127+
handle_, inputDLTensors_, outputDLTensors_, profile_);
128+
} else {
129+
executionEngine_->uncheckedRun(handle_, inputVoidPtrs_, outputVoidPtrs_);
122130
}
123131

124-
// compile and run
125-
auto handle = executionEngine_->compile(
126-
tcName_,
127-
inputDLTensors,
128-
cudaMappingOptions_.toProtobufSerializedString());
129-
executionEngine_->run(handle, inputDLTensors, outputDLTensors, profile_);
130132
return true;
131133
}
132134

@@ -136,6 +138,14 @@ class TcOp : public Operator<Context> {
136138
std::string tcName_;
137139
std::string gradTcName_;
138140
bool profile_;
141+
bool compiled_;
142+
size_t handle_;
143+
std::vector<const void*> inputVoidPtrs_;
144+
std::vector<void*> outputVoidPtrs_;
145+
std::vector<const DLTensor*> inputDLTensors_;
146+
std::vector<DLTensor*> outputDLTensors_;
147+
std::vector<::tc::dlutils::DLTensorUPtr> inTensorUPtrs_;
148+
std::vector<::tc::dlutils::DLTensorUPtr> outTensorUPtrs_;
139149
tc::CudaMappingOptions cudaMappingOptions_;
140150
tc::CudaMappingOptions gradCudaMappingOptions_;
141151

0 commit comments

Comments
 (0)