
Commit c3a9c1b

Add parameters support to InferResponse
* Infer response to track parameters
* Add parameters to binding infer response
* Rank parameters argument up among InferResponse constructor arguments
* Add setting parameters to Triton response
* Send response parameters only on non-error
* Fix double declaration
1 parent b771f4f commit c3a9c1b
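
Taken together, the changes below let a Python model attach parameters to an inference response. A minimal, hypothetical `model.py` sketch of the new surface (the model outputs and parameter values are made up; the `parameters` keyword and its str-key, bool/int/str-value restriction come from the `pb_stub.cc` binding in this commit):

```python
import numpy as np
import triton_python_backend_utils as pb_utils


class TritonPythonModel:
    def execute(self, requests):
        responses = []
        for _ in requests:
            out = pb_utils.Tensor("OUTPUT0", np.array([1, 2, 3], np.int32))
            # New in this commit: `parameters` accepts a dict with str keys
            # and bool/int/str values; the stub JSON-serializes it and the
            # backend applies each entry to the Triton response.
            responses.append(
                pb_utils.InferenceResponse(
                    output_tensors=[out],
                    parameters={"cached": True, "attempts": 1, "note": "ok"},
                )
            )
        return responses
```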

File tree

4 files changed, +120 -29 lines changed


src/infer_response.cc

Lines changed: 66 additions & 11 deletions
```diff
@@ -1,4 +1,4 @@
-// Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -39,8 +39,10 @@ namespace triton { namespace backend { namespace python {
 
 InferResponse::InferResponse(
     const std::vector<std::shared_ptr<PbTensor>>& output_tensors,
-    std::shared_ptr<PbError> error, const bool is_last_response, void* id)
-    : error_(error), is_last_response_(is_last_response), id_(id)
+    std::shared_ptr<PbError> error, std::string parameters,
+    const bool is_last_response, void* id)
+    : error_(error), is_last_response_(is_last_response), id_(id),
+      parameters_(std::move(parameters))
 {
   for (auto& output : output_tensors) {
     if (!output) {
@@ -58,6 +60,12 @@ InferResponse::OutputTensors()
   return output_tensors_;
 }
 
+std::string&
+InferResponse::Parameters()
+{
+  return parameters_;
+}
+
 bool
 InferResponse::HasError()
 {
@@ -106,6 +114,9 @@ InferResponse::SaveToSharedMemory(
       j++;
     }
     response_shm_ptr->id = id_;
+
+    parameters_shm_ = PbString::Create(shm_pool, parameters_);
+    response_shm_ptr->parameters = parameters_shm_->ShmHandle();
   }
 }
 
@@ -143,6 +154,8 @@ InferResponse::LoadFromSharedMemory(
 
   std::shared_ptr<PbError> pb_error;
   std::vector<std::shared_ptr<PbTensor>> output_tensors;
+  std::shared_ptr<PbString> parameters_shm;
+  std::string parameters;
 
   // If the error field is set, do not load output tensors from shared memory.
   if (response_shm_ptr->has_error && response_shm_ptr->is_error_set) {
@@ -154,33 +167,43 @@ InferResponse::LoadFromSharedMemory(
     bi::managed_external_buffer::handle_t* tensor_handle_shm =
         reinterpret_cast<bi::managed_external_buffer::handle_t*>(
             response_shm.data_.get() + sizeof(ResponseShm));
+    {
 #ifdef TRITON_PB_STUB
-    // Need to acquire the GIL to avoid hangs.
-    py::gil_scoped_acquire acquire;
+      // Need to acquire the GIL to avoid hangs.
+      py::gil_scoped_acquire acquire;
 #endif
-    for (size_t idx = 0; idx < requested_output_count; ++idx) {
-      std::shared_ptr<PbTensor> pb_tensor = PbTensor::LoadFromSharedMemory(
-          shm_pool, tensor_handle_shm[idx], open_cuda_handle);
-      output_tensors.emplace_back(std::move(pb_tensor));
+      for (size_t idx = 0; idx < requested_output_count; ++idx) {
+        std::shared_ptr<PbTensor> pb_tensor = PbTensor::LoadFromSharedMemory(
+            shm_pool, tensor_handle_shm[idx], open_cuda_handle);
+        output_tensors.emplace_back(std::move(pb_tensor));
+      }
     }
+
+    parameters_shm = std::move(
+        PbString::LoadFromSharedMemory(shm_pool, response_shm_ptr->parameters));
+    parameters = parameters_shm->String();
   }
 
   return std::unique_ptr<InferResponse>(new InferResponse(
       response_shm, output_tensors, pb_error,
-      response_shm_ptr->is_last_response, response_shm_ptr->id));
+      response_shm_ptr->is_last_response, response_shm_ptr->id, parameters_shm,
+      parameters));
 }
 
 InferResponse::InferResponse(
     AllocatedSharedMemory<char>& response_shm,
     std::vector<std::shared_ptr<PbTensor>>& output_tensors,
-    std::shared_ptr<PbError>& pb_error, const bool is_last_response, void* id)
+    std::shared_ptr<PbError>& pb_error, const bool is_last_response, void* id,
+    std::shared_ptr<PbString>& parameters_shm, std::string& parameters)
 {
   response_shm_ = std::move(response_shm);
   output_tensors_ = std::move(output_tensors);
   error_ = std::move(pb_error);
   shm_handle_ = response_shm_.handle_;
   id_ = id;
   is_last_response_ = is_last_response;
+  parameters_shm_ = std::move(parameters_shm);
+  parameters_ = std::move(parameters);
 }
 
 std::shared_ptr<PbError>&
@@ -387,6 +410,38 @@ InferResponse::Send(
     cuda_copy |= cuda_used;
   }
 
+  if (!parameters_.empty()) {
+    triton::common::TritonJson::Value param;
+    THROW_IF_TRITON_ERROR(
+        param.Parse(parameters_.c_str(), parameters_.length()));
+    std::vector<std::string> param_keys;
+    THROW_IF_TRITON_ERROR(param.Members(&param_keys));
+    for (const auto& key : param_keys) {
+      triton::common::TritonJson::Value value;
+      if (!param.Find(key.c_str(), &value)) {
+        throw PythonBackendException("Unexpected missing key on parameters");
+      }
+      if (value.IsString()) {
+        std::string string_value;
+        THROW_IF_TRITON_ERROR(value.AsString(&string_value));
+        THROW_IF_TRITON_ERROR(TRITONBACKEND_ResponseSetStringParameter(
+            response, key.c_str(), string_value.c_str()));
+      } else if (value.IsInt()) {
+        int64_t int_value = 0;
+        THROW_IF_TRITON_ERROR(value.AsInt(&int_value));
+        THROW_IF_TRITON_ERROR(TRITONBACKEND_ResponseSetIntParameter(
+            response, key.c_str(), int_value));
+      } else if (value.IsBool()) {
+        bool bool_value = false;
+        THROW_IF_TRITON_ERROR(value.AsBool(&bool_value));
+        THROW_IF_TRITON_ERROR(TRITONBACKEND_ResponseSetBoolParameter(
+            response, key.c_str(), bool_value));
+      } else {
+        throw PythonBackendException("Unsupported value type on parameters");
+      }
+    }
+  }
+
 #ifdef TRITON_ENABLE_GPU
   if (cuda_copy) {
     cudaStreamSynchronize(reinterpret_cast<cudaStream_t>(cuda_stream));
```
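
As a rough illustration of the round trip implemented above (not part of the commit): the stub serializes the parameters dict to a single JSON string, `LoadFromSharedMemory()` rehydrates it from a `PbString`, and `Send()` parses it and dispatches each entry on its JSON type:

```python
import json

# Hypothetical parameters a model might set; the dict is serialized once.
params = {"cached": False, "attempts": 2, "note": "warm"}
serialized = json.dumps(params)  # the string stored in shared memory

# Per the Send() loop above, each parsed entry maps to one setter:
#   str  -> TRITONBACKEND_ResponseSetStringParameter
#   int  -> TRITONBACKEND_ResponseSetIntParameter
#   bool -> TRITONBACKEND_ResponseSetBoolParameter
# Any other JSON type throws "Unsupported value type on parameters".
```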

src/infer_response.h

Lines changed: 9 additions & 4 deletions
```diff
@@ -1,4 +1,4 @@
-// Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -38,6 +38,7 @@ namespace triton { namespace backend { namespace python {
 
 struct ResponseShm {
   uint32_t outputs_size;
+  bi::managed_external_buffer::handle_t parameters;
   bi::managed_external_buffer::handle_t error;
   bool has_error;
   // Indicates whether this error has a message or not.
@@ -72,9 +73,10 @@ class InferResponse {
  public:
   InferResponse(
       const std::vector<std::shared_ptr<PbTensor>>& output_tensors,
-      std::shared_ptr<PbError> error = nullptr,
+      std::shared_ptr<PbError> error = nullptr, std::string parameters = "",
       const bool is_last_response = true, void* id = nullptr);
   std::vector<std::shared_ptr<PbTensor>>& OutputTensors();
+  std::string& Parameters();
   void SaveToSharedMemory(
       std::unique_ptr<SharedMemoryManager>& shm_pool, bool copy_gpu = true);
   static std::unique_ptr<InferResponse> LoadFromSharedMemory(
@@ -116,8 +118,8 @@ class InferResponse {
   InferResponse(
       AllocatedSharedMemory<char>& response_shm,
       std::vector<std::shared_ptr<PbTensor>>& output_tensors,
-      std::shared_ptr<PbError>& pb_error, const bool is_last_response,
-      void* id);
+      std::shared_ptr<PbError>& pb_error, const bool is_last_response, void* id,
+      std::shared_ptr<PbString>& parameters_shm, std::string& parameters);
   std::vector<std::shared_ptr<PbTensor>> output_tensors_;
 
   std::shared_ptr<PbError> error_;
@@ -128,6 +130,9 @@ class InferResponse {
   bool is_last_response_;
   // Representing the request id that the response was created from.
   void* id_;
+
+  std::shared_ptr<PbString> parameters_shm_;
+  std::string parameters_;
 };
 
 }}}  // namespace triton::backend::python
```

src/pb_stub.cc

Lines changed: 33 additions & 6 deletions
```diff
@@ -1,4 +1,4 @@
-// Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -1869,16 +1869,43 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module)
   py::class_<InferResponse, std::shared_ptr<InferResponse>>(
       module, "InferenceResponse")
       .def(
-          py::init<
-              const std::vector<std::shared_ptr<PbTensor>>&,
-              std::shared_ptr<PbError>>(),
+          py::init([](const std::vector<std::shared_ptr<PbTensor>>&
+                          output_tensors,
+                      const std::shared_ptr<PbError>& error,
+                      const py::object& parameters_) {
+            py::dict parameters =
+                PyDefaultArgumentToMutableType<py::dict>(parameters_);
+            for (const auto& pair : parameters) {
+              if (!py::isinstance<py::str>(pair.first)) {
+                throw PythonBackendException(
+                    "Expect parameters keys to have type str, found type " +
+                    std::string(py::str(pair.first.get_type())));
+              }
+              if (!py::isinstance<py::bool_>(pair.second) &&
+                  !py::isinstance<py::int_>(pair.second) &&
+                  !py::isinstance<py::str>(pair.second)) {
+                throw PythonBackendException(
+                    "Expect parameters values to have type bool/int/str, found "
+                    "type " +
+                    std::string(py::str(pair.second.get_type())));
+              }
+            }
+            py::module_ py_json = py::module_::import("json");
+            std::string parameters_str =
+                py::str(py_json.attr("dumps")(parameters));
+
+            return std::make_shared<InferResponse>(
+                output_tensors, error, parameters_str /* parameters */);
+          }),
           py::arg("output_tensors") = py::list(),
-          py::arg("error") = static_cast<std::shared_ptr<PbError>>(nullptr))
+          py::arg("error") = static_cast<std::shared_ptr<PbError>>(nullptr),
+          py::arg("parameters") = py::dict())
       .def(
           "output_tensors", &InferResponse::OutputTensors,
           py::return_value_policy::reference)
       .def("has_error", &InferResponse::HasError)
-      .def("error", &InferResponse::Error);
+      .def("error", &InferResponse::Error)
+      .def("parameters", &InferResponse::Parameters);
 
   py::class_<ResponseSender, std::shared_ptr<ResponseSender>>(
       module, "InferenceResponseSender")
```
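
Because the binding validates types inside `py::init`, malformed parameters fail when the response object is constructed, not when it is sent. A hedged sketch of the observable behavior (the exact exception type surfaced to Python may vary):

```python
import triton_python_backend_utils as pb_utils

# Keys must be str and values bool/int/str, per the checks above.
try:
    pb_utils.InferenceResponse(parameters={1: "x"})  # non-str key
except Exception as e:
    print(e)  # "Expect parameters keys to have type str, found type ..."

try:
    pb_utils.InferenceResponse(parameters={"p": 0.5})  # float rejected
except Exception as e:
    print(e)  # "Expect parameters values to have type bool/int/str, ..."

# parameters() returns the JSON string the stub produced with json.dumps.
ok = pb_utils.InferenceResponse(parameters={"p": 1})
print(ok.parameters())  # '{"p": 1}'
```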

src/request_executor.cc

Lines changed: 12 additions & 8 deletions
```diff
@@ -1,4 +1,4 @@
-// Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -153,20 +153,22 @@ InferResponseComplete(
     output_tensors.clear();
   }
 
+  // TODO: [DLIS-7864] Pass response parameters from BLS response.
   if (!infer_payload->IsDecoupled()) {
     infer_response = std::make_unique<InferResponse>(
-        output_tensors, pb_error, true /* is_last_response */);
+        output_tensors, pb_error, "" /* parameters */,
+        true /* is_last_response */);
   } else {
     if ((flags & TRITONSERVER_RESPONSE_COMPLETE_FINAL) == 0) {
       // Not the last response.
       infer_response = std::make_unique<InferResponse>(
-          output_tensors, pb_error, false /* is_last_response */,
-          userp /* id */);
+          output_tensors, pb_error, "" /* parameters */,
+          false /* is_last_response */, userp /* id */);
     } else {
       // The last response.
       infer_response = std::make_unique<InferResponse>(
-          output_tensors, pb_error, true /* is_last_response */,
-          userp /* id */);
+          output_tensors, pb_error, "" /* parameters */,
+          true /* is_last_response */, userp /* id */);
     }
   }
 
@@ -178,11 +180,13 @@ InferResponseComplete(
       (flags & TRITONSERVER_RESPONSE_COMPLETE_FINAL) != 0) {
     // An empty response may be the last response for decoupled models.
     infer_response = std::make_unique<InferResponse>(
-        output_tensors, pb_error, true /* is_last_response */, userp /* id */);
+        output_tensors, pb_error, "" /* parameters */,
+        true /* is_last_response */, userp /* id */);
   } else {
     pb_error = std::make_shared<PbError>("Unexpected empty response.");
     infer_response = std::make_unique<InferResponse>(
-        output_tensors, pb_error, true /* is_last_response */, userp /* id */);
+        output_tensors, pb_error, "" /* parameters */,
+        true /* is_last_response */, userp /* id */);
   }
 
   infer_payload->SetValue(std::move(infer_response));
```
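
Per the DLIS-7864 TODO, every InferResponse built in this callback passes an empty parameters string, so parameters returned by a BLS target model are not yet forwarded to the caller. A hypothetical BLS snippet showing the current behavior (model and output names are made up):

```python
import triton_python_backend_utils as pb_utils

# Inside a model's execute(): call another model via BLS.
bls_request = pb_utils.InferenceRequest(
    model_name="downstream_model",  # hypothetical target
    requested_output_names=["OUTPUT0"],
    inputs=[],
)
bls_response = bls_request.exec()

# Until DLIS-7864 is addressed, any parameters the downstream model set
# are dropped here; the BLS response carries an empty parameters string.
assert bls_response.parameters() == ""
```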
