
Commit 42f1888

[Serving][Backend] Backend support zero_copy_infer and Serving reduce the output memory copy (#703)

* backend add zero copy infer interface
* fix bug
* fix bug
* fix bug
* paddle ipu

1 parent edcf150 · commit 42f1888

21 files changed: +254 −109 lines

fastdeploy/backends/backend.h
Lines changed: 4 additions & 1 deletion

@@ -62,8 +62,11 @@ class BaseBackend {
   virtual TensorInfo GetOutputInfo(int index) = 0;
   virtual std::vector<TensorInfo> GetInputInfos() = 0;
   virtual std::vector<TensorInfo> GetOutputInfos() = 0;
+  // If copy_to_fd is true, copy memory data to FDTensor;
+  // else share memory to FDTensor (only Paddle, ORT, TRT and OpenVINO support it).
   virtual bool Infer(std::vector<FDTensor>& inputs,
-                     std::vector<FDTensor>* outputs) = 0;
+                     std::vector<FDTensor>* outputs,
+                     bool copy_to_fd = true) = 0;
   virtual std::unique_ptr<BaseBackend> Clone(void *stream = nullptr,
                                              int device_id = -1) {
     FDERROR << "Clone no support" << std::endl;
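
The new copy_to_fd flag lets a caller trade safety for speed: true (the default) gives each output FDTensor its own copy of the result, while false makes the FDTensor a zero-copy view into the backend's buffers. A minimal caller-side sketch, assuming an already-initialized backend and a hypothetical PrepareInputs() helper:

    std::vector<fastdeploy::FDTensor> inputs = PrepareInputs();  // assumed helper
    std::vector<fastdeploy::FDTensor> outputs;

    // Default: outputs own their memory and stay valid across later calls.
    backend->Infer(inputs, &outputs);

    // Zero-copy: outputs alias backend-owned buffers (via SetExternalData),
    // so read them before the next Infer() may overwrite or free that memory.
    // Only the Paddle, ORT, TRT and OpenVINO backends honor this path.
    backend->Infer(inputs, &outputs, /*copy_to_fd=*/false);

Because the parameter defaults to true, existing callers keep the old copying behavior unchanged.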

fastdeploy/backends/lite/lite_backend.cc
Lines changed: 2 additions & 1 deletion

@@ -187,7 +187,8 @@ TensorInfo LiteBackend::GetOutputInfo(int index) {
 std::vector<TensorInfo> LiteBackend::GetOutputInfos() { return outputs_desc_; }
 
 bool LiteBackend::Infer(std::vector<FDTensor>& inputs,
-                        std::vector<FDTensor>* outputs) {
+                        std::vector<FDTensor>* outputs,
+                        bool copy_to_fd) {
   if (inputs.size() != inputs_desc_.size()) {
     FDERROR << "[LiteBackend] Size of inputs(" << inputs.size()
             << ") should keep same with the inputs of this model("

fastdeploy/backends/lite/lite_backend.h
Lines changed: 3 additions & 1 deletion

@@ -60,7 +60,9 @@ class LiteBackend : public BaseBackend {
                    const std::string& params_file,
                    const LiteBackendOption& option = LiteBackendOption());
 
-  bool Infer(std::vector<FDTensor>& inputs, std::vector<FDTensor>* outputs) override; // NOLINT
+  bool Infer(std::vector<FDTensor>& inputs,
+             std::vector<FDTensor>* outputs,
+             bool copy_to_fd = true) override; // NOLINT
 
   int NumInputs() const override { return inputs_desc_.size(); }

fastdeploy/backends/openvino/ov_backend.cc
Lines changed: 15 additions & 5 deletions

@@ -341,7 +341,8 @@ int OpenVINOBackend::NumInputs() const { return input_infos_.size(); }
 int OpenVINOBackend::NumOutputs() const { return output_infos_.size(); }
 
 bool OpenVINOBackend::Infer(std::vector<FDTensor>& inputs,
-                            std::vector<FDTensor>* outputs) {
+                            std::vector<FDTensor>* outputs,
+                            bool copy_to_fd) {
   if (inputs.size() != input_infos_.size()) {
     FDERROR << "[OpenVINOBackend] Size of the inputs(" << inputs.size()
             << ") should keep same with the inputs of this model("
@@ -364,11 +365,20 @@ bool OpenVINOBackend::Infer(std::vector<FDTensor>& inputs,
     auto out_tensor_shape = out_tensor.get_shape();
     std::vector<int64_t> shape(out_tensor_shape.begin(),
                                out_tensor_shape.end());
-    (*outputs)[i].Allocate(shape,
-                           OpenVINODataTypeToFD(out_tensor.get_element_type()),
-                           output_infos_[i].name);
-    memcpy((*outputs)[i].MutableData(), out_tensor.data(),
-           (*outputs)[i].Nbytes());
+    if (copy_to_fd) {
+      (*outputs)[i].Resize(shape,
+                           OpenVINODataTypeToFD(out_tensor.get_element_type()),
+                           output_infos_[i].name,
+                           Device::CPU);
+      memcpy((*outputs)[i].MutableData(), out_tensor.data(),
+             (*outputs)[i].Nbytes());
+    } else {
+      (*outputs)[i].name = output_infos_[i].name;
+      (*outputs)[i].SetExternalData(
+          shape,
+          OpenVINODataTypeToFD(out_tensor.get_element_type()),
+          out_tensor.data(),
+          Device::CPU);
+    }
   }
   return true;
 }
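
Both branches yield an FDTensor with the same shape, dtype, and name; they differ only in who owns the bytes. A condensed sketch of the copy-vs-share pattern that this and the following backends repeat, using the FDTensor calls visible in the diff (ToFDTensor is a hypothetical helper name, not FastDeploy code):

    #include <cstring>  // std::memcpy

    void ToFDTensor(const void* src, const std::vector<int64_t>& shape,
                    fastdeploy::FDDataType dtype, const std::string& name,
                    size_t nbytes, bool copy_to_fd, fastdeploy::FDTensor* out) {
      if (copy_to_fd) {
        // Owned copy: allocate inside the FDTensor, then copy the bytes over.
        out->Resize(shape, dtype, name, fastdeploy::Device::CPU);
        std::memcpy(out->MutableData(), src, nbytes);
      } else {
        // Borrowed view: record the external pointer; no allocation, no copy,
        // but src must outlive every use of *out.
        out->name = name;
        out->SetExternalData(shape, dtype, const_cast<void*>(src),
                             fastdeploy::Device::CPU);
      }
    }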

fastdeploy/backends/openvino/ov_backend.h
Lines changed: 2 additions & 1 deletion

@@ -48,7 +48,8 @@ class OpenVINOBackend : public BaseBackend {
             const OpenVINOBackendOption& option = OpenVINOBackendOption());
 
   bool Infer(std::vector<FDTensor>& inputs,
-             std::vector<FDTensor>* outputs) override;
+             std::vector<FDTensor>* outputs,
+             bool copy_to_fd = true) override;
 
   int NumInputs() const override;
fastdeploy/backends/ort/ort_backend.cc

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -181,8 +181,8 @@ bool OrtBackend::InitFromOnnx(const std::string& model_file,
181181
return true;
182182
}
183183

184-
void OrtBackend::CopyToCpu(const Ort::Value& value, FDTensor* tensor,
185-
const std::string& name) {
184+
void OrtBackend::OrtValueToFDTensor(const Ort::Value& value, FDTensor* tensor,
185+
const std::string& name, bool copy_to_fd) {
186186
const auto info = value.GetTensorTypeAndShapeInfo();
187187
const auto data_type = info.GetElementType();
188188
size_t numel = info.GetElementCount();
@@ -210,12 +210,21 @@ void OrtBackend::CopyToCpu(const Ort::Value& value, FDTensor* tensor,
210210
"Unrecognized data type of %d while calling OrtBackend::CopyToCpu().",
211211
data_type);
212212
}
213-
tensor->Resize(shape, dtype, name);
214-
memcpy(tensor->MutableData(), value.GetTensorData<void*>(), numel);
213+
const void* value_ptr = value.GetTensorData<void*>();
214+
if (copy_to_fd) {
215+
tensor->Resize(shape, dtype, name);
216+
memcpy(tensor->MutableData(), value_ptr, numel);
217+
} else {
218+
tensor->name = name;
219+
tensor->SetExternalData(
220+
shape, dtype,
221+
const_cast<void*>(value_ptr), Device::CPU);
222+
}
215223
}
216224

217225
bool OrtBackend::Infer(std::vector<FDTensor>& inputs,
218-
std::vector<FDTensor>* outputs) {
226+
std::vector<FDTensor>* outputs,
227+
bool copy_to_fd) {
219228
if (inputs.size() != inputs_desc_.size()) {
220229
FDERROR << "[OrtBackend] Size of the inputs(" << inputs.size()
221230
<< ") should keep same with the inputs of this model("
@@ -243,11 +252,12 @@ bool OrtBackend::Infer(std::vector<FDTensor>& inputs,
243252
return false;
244253
}
245254

246-
// Copy result after inference
255+
// Convert result after inference
247256
std::vector<Ort::Value> ort_outputs = binding_->GetOutputValues();
248257
outputs->resize(ort_outputs.size());
249258
for (size_t i = 0; i < ort_outputs.size(); ++i) {
250-
CopyToCpu(ort_outputs[i], &((*outputs)[i]), outputs_desc_[i].name);
259+
OrtValueToFDTensor(ort_outputs[i], &((*outputs)[i]),
260+
outputs_desc_[i].name, copy_to_fd);
251261
}
252262

253263
return true;
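
The const_cast is forced by the APIs: Ort::Value::GetTensorData returns a const pointer while SetExternalData stores a mutable void*, so borrowed tensors should be treated as read-only. This conversion is also where Serving saves its output memcpy, roughly as below (WriteToResponse is a hypothetical serializer; Data() and Nbytes() are the FDTensor accessors assumed from this commit):

    std::vector<fastdeploy::FDTensor> outs;
    ort_backend.Infer(feeds, &outs, /*copy_to_fd=*/false);
    for (auto& t : outs) {
      // Serialize straight out of ORT's buffer; no intermediate copy.
      WriteToResponse(t.name, t.Data(), t.Nbytes());
    }
    // After the next Infer(), the memory behind 'outs' may be reused.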

fastdeploy/backends/ort/ort_backend.h
Lines changed: 4 additions & 3 deletions

@@ -68,7 +68,8 @@ class OrtBackend : public BaseBackend {
                           bool from_memory_buffer = false);
 
   bool Infer(std::vector<FDTensor>& inputs,
-             std::vector<FDTensor>* outputs) override;
+             std::vector<FDTensor>* outputs,
+             bool copy_to_fd = true) override;
 
   int NumInputs() const override { return inputs_desc_.size(); }

@@ -92,7 +93,7 @@ class OrtBackend : public BaseBackend {
   Ort::CustomOpDomain custom_op_domain_ = Ort::CustomOpDomain("Paddle");
 #endif
   OrtBackendOption option_;
-  void CopyToCpu(const Ort::Value& value, FDTensor* tensor,
-                 const std::string& name);
+  void OrtValueToFDTensor(const Ort::Value& value, FDTensor* tensor,
+                          const std::string& name, bool copy_to_fd);
 };
 }  // namespace fastdeploy

fastdeploy/backends/paddle/paddle_backend.cc
Lines changed: 11 additions & 3 deletions

@@ -194,7 +194,8 @@ std::vector<TensorInfo> PaddleBackend::GetOutputInfos() {
 }
 
 bool PaddleBackend::Infer(std::vector<FDTensor>& inputs,
-                          std::vector<FDTensor>* outputs) {
+                          std::vector<FDTensor>* outputs,
+                          bool copy_to_fd) {
   if (inputs.size() != inputs_desc_.size()) {
     FDERROR << "[PaddleBackend] Size of inputs(" << inputs.size()
             << ") should keep same with the inputs of this model("
@@ -208,11 +209,18 @@ bool PaddleBackend::Infer(std::vector<FDTensor>& inputs,
   }
 
   predictor_->Run();
+
+  // Sharing the backend's output memory is only supported on CPU or GPU.
+  if (option_.use_ipu) {
+    copy_to_fd = true;
+  }
   outputs->resize(outputs_desc_.size());
   for (size_t i = 0; i < outputs_desc_.size(); ++i) {
     auto handle = predictor_->GetOutputHandle(outputs_desc_[i].name);
-    (*outputs)[i].is_pinned_memory = option_.enable_pinned_memory;
-    CopyTensorToCpu(handle, &((*outputs)[i]));
+    if (copy_to_fd) {
+      (*outputs)[i].is_pinned_memory = option_.enable_pinned_memory;
+    }
+    PaddleTensorToFDTensor(handle, &((*outputs)[i]), copy_to_fd);
   }
   return true;
 }
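
Note the defensive downgrade: on IPU a zero-copy request is silently turned back into a copy, since output sharing is only implemented for CPU and GPU placements; likewise the is_pinned_memory hint is applied only on the copy path, because in share mode the FDTensor never allocates its own buffer. One practical consequence for callers, sketched below: with a GPU predictor and copy_to_fd=false, the returned FDTensor can point at device memory, so check its device field before touching the data (a sketch; device is the FDTensor member implied by SetExternalData's Device argument):

    std::vector<fastdeploy::FDTensor> outs;
    paddle_backend.Infer(feeds, &outs, /*copy_to_fd=*/false);
    for (auto& t : outs) {
      if (t.device == fastdeploy::Device::GPU) {
        // Data lives on the GPU and is not directly dereferenceable on host;
        // hand it to GPU-side post-processing, or re-run with copy_to_fd=true.
      }
    }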

fastdeploy/backends/paddle/paddle_backend.h
Lines changed: 9 additions & 4 deletions

@@ -87,9 +87,12 @@ paddle_infer::PlaceType ConvertFDDeviceToPlace(Device device);
 // Share memory buffer with paddle_infer::Tensor from fastdeploy::FDTensor
 void ShareTensorFromFDTensor(paddle_infer::Tensor* tensor, FDTensor& fd_tensor);
 
-// Copy memory data from paddle_infer::Tensor to fastdeploy::FDTensor
-void CopyTensorToCpu(std::unique_ptr<paddle_infer::Tensor>& tensor,
-                     FDTensor* fd_tensor);
+// Convert paddle_infer::Tensor to fastdeploy::FDTensor.
+// If copy_to_fd is true, copy memory data to FDTensor;
+// else share memory to FDTensor.
+void PaddleTensorToFDTensor(std::unique_ptr<paddle_infer::Tensor>& tensor,
+                            FDTensor* fd_tensor,
+                            bool copy_to_fd);
 
 // Convert data type from paddle inference to fastdeploy
 FDDataType PaddleDataTypeToFD(const paddle_infer::DataType& dtype);

@@ -108,7 +111,9 @@ class PaddleBackend : public BaseBackend {
                  const PaddleBackendOption& option = PaddleBackendOption());
 
   bool Infer(std::vector<FDTensor>& inputs,
-             std::vector<FDTensor>* outputs) override;
+             std::vector<FDTensor>* outputs,
+             bool copy_to_fd = true) override;
+
 
   int NumInputs() const override { return inputs_desc_.size(); }
fastdeploy/backends/paddle/util.cc
Lines changed: 30 additions & 14 deletions

@@ -61,25 +61,41 @@ void ShareTensorFromFDTensor(paddle_infer::Tensor* tensor,
              Str(fd_tensor.dtype).c_str());
 }
 
-void CopyTensorToCpu(std::unique_ptr<paddle_infer::Tensor>& tensor,
-                     FDTensor* fd_tensor) {
+void PaddleTensorToFDTensor(std::unique_ptr<paddle_infer::Tensor>& tensor,
+                            FDTensor* fd_tensor,
+                            bool copy_to_fd) {
   auto fd_dtype = PaddleDataTypeToFD(tensor->type());
   std::vector<int64_t> shape;
   auto tmp_shape = tensor->shape();
   shape.assign(tmp_shape.begin(), tmp_shape.end());
-  fd_tensor->Resize(shape, fd_dtype, tensor->name());
-  if (fd_tensor->dtype == FDDataType::FP32) {
-    tensor->CopyToCpu(static_cast<float*>(fd_tensor->MutableData()));
-    return;
-  } else if (fd_tensor->dtype == FDDataType::INT32) {
-    tensor->CopyToCpu(static_cast<int32_t*>(fd_tensor->MutableData()));
-    return;
-  } else if (fd_tensor->dtype == FDDataType::INT64) {
-    tensor->CopyToCpu(static_cast<int64_t*>(fd_tensor->MutableData()));
-    return;
+  if (copy_to_fd) {
+    fd_tensor->Resize(shape, fd_dtype, tensor->name());
+    if (fd_tensor->dtype == FDDataType::FP32) {
+      tensor->CopyToCpu(static_cast<float*>(fd_tensor->MutableData()));
+      return;
+    } else if (fd_tensor->dtype == FDDataType::INT32) {
+      tensor->CopyToCpu(static_cast<int32_t*>(fd_tensor->MutableData()));
+      return;
+    } else if (fd_tensor->dtype == FDDataType::INT64) {
+      tensor->CopyToCpu(static_cast<int64_t*>(fd_tensor->MutableData()));
+      return;
+    }
+    FDASSERT(false, "Unexpected data type(%s) while infer with PaddleBackend.",
+             Str(fd_tensor->dtype).c_str());
+  } else {
+    paddle_infer::PlaceType place;
+    int size = 0;
+    // TODO(liqi): Paddle's tensor->data interface doesn't return the device
+    // id and doesn't support returning void*.
+    auto* out_data = tensor->data<uint8_t>(&place, &size);
+    Device device = Device::CPU;
+    if (place == paddle_infer::PlaceType::kGPU) {
+      device = Device::GPU;
+    }
+    fd_tensor->SetExternalData(
+        shape, fd_dtype,
+        reinterpret_cast<void*>(out_data), device);
   }
-  FDASSERT(false, "Unexpected data type(%s) while infer with PaddleBackend.",
-           Str(fd_tensor->dtype).c_str());
 }
 
 FDDataType PaddleDataTypeToFD(const paddle_infer::DataType& dtype) {
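
Because paddle_infer::Tensor::data is typed and, per the TODO, has no void* overload, the share branch borrows the buffer as uint8_t* and reattaches the real element type on the FDTensor side: the raw pointer is dtype-agnostic, while shape and fd_dtype travel separately. A trimmed sketch of that idea, assuming the same Paddle and FDTensor APIs the diff uses:

    paddle_infer::PlaceType place;
    int size = 0;  // size reported by Paddle; unused for the borrowed view
    auto* bytes = tensor->data<uint8_t>(&place, &size);  // raw bytes, any dtype

    // Map Paddle's placement onto FastDeploy's device enum.
    fastdeploy::Device device = (place == paddle_infer::PlaceType::kGPU)
                                    ? fastdeploy::Device::GPU
                                    : fastdeploy::Device::CPU;

    // The FDTensor carries the true dtype; the pointer is just bytes.
    fd_tensor->SetExternalData(shape, fd_dtype, bytes, device);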

fastdeploy/backends/poros/poros_backend.cc
Lines changed: 2 additions & 1 deletion

@@ -188,7 +188,8 @@ bool PorosBackend::InitFromPoros(const std::string& model_file,
 }
 
 bool PorosBackend::Infer(std::vector<FDTensor>& inputs,
-                         std::vector<FDTensor>* outputs) {
+                         std::vector<FDTensor>* outputs,
+                         bool copy_to_fd) {
   // Convert FD Tensor to PyTorch Tensor
   std::vector<torch::jit::IValue> poros_inputs;
   bool is_backend_cuda =

fastdeploy/backends/poros/poros_backend.h
Lines changed: 3 additions & 1 deletion

@@ -85,7 +85,9 @@ class PorosBackend : public BaseBackend {
             std::vector<std::vector<FDTensor>>& prewarm_tensors,
             const PorosBackendOption& option = PorosBackendOption());
 
-  bool Infer(std::vector<FDTensor>& inputs, std::vector<FDTensor>* outputs);
+  bool Infer(std::vector<FDTensor>& inputs,
+             std::vector<FDTensor>* outputs,
+             bool copy_to_fd = true) override;
 
   int NumInputs() const { return _numinputs; }

fastdeploy/backends/rknpu/rknpu2/rknpu2_backend.cc
Lines changed: 2 additions & 1 deletion

@@ -289,7 +289,8 @@ std::vector<TensorInfo> RKNPU2Backend::GetOutputInfos() {
 }
 
 bool RKNPU2Backend::Infer(std::vector<FDTensor>& inputs,
-                          std::vector<FDTensor>* outputs) {
+                          std::vector<FDTensor>* outputs,
+                          bool copy_to_fd) {
   int ret = RKNN_SUCC;
   // Judge whether the input and output size are the same
   if (inputs.size() != inputs_desc_.size()) {

fastdeploy/backends/rknpu/rknpu2/rknpu2_backend.h
Lines changed: 2 additions & 1 deletion

@@ -72,7 +72,8 @@ class RKNPU2Backend : public BaseBackend {
   std::vector<TensorInfo> GetInputInfos() override;
   std::vector<TensorInfo> GetOutputInfos() override;
   bool Infer(std::vector<FDTensor>& inputs,
-             std::vector<FDTensor>* outputs) override;
+             std::vector<FDTensor>* outputs,
+             bool copy_to_fd = true) override;
 
  private:
  // The object of rknn context.
