diff --git a/include/triton/core/tritonserver.h b/include/triton/core/tritonserver.h
index efbac44be..09ff58126 100644
--- a/include/triton/core/tritonserver.h
+++ b/include/triton/core/tritonserver.h
@@ -1,4 +1,4 @@
-// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2020-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -1041,7 +1041,8 @@ TRITONSERVER_InferenceRequestNew(
     struct TRITONSERVER_Server* server, const char* model_name,
     const int64_t model_version);
 
-/// Delete an inference request object.
+/// Delete an inference request object. The request object must be
+/// released before deletion.
 ///
 /// \param inference_request The request object.
 /// \return a TRITONSERVER_Error indicating success or failure.
diff --git a/src/infer_request.cc b/src/infer_request.cc
index 9ec7d43b3..4d0ca8096 100644
--- a/src/infer_request.cc
+++ b/src/infer_request.cc
@@ -112,7 +112,7 @@ InferenceRequest::InferenceRequest(
   SetPriority(0);
   // Outer-most release callback to ensure a request has been taken, this
   // callback won't be invoked, if certain flags are set.
-  release_callbacks_.emplace_back(
+  release_callbacks_.emplace_back(std::make_pair(
       [](std::unique_ptr<InferenceRequest>& request,
          const uint32_t flags) -> Status {
         if (flags & TRITONSERVER_REQUEST_RELEASE_RESCHEDULE) {
@@ -123,7 +123,8 @@ InferenceRequest::InferenceRequest(
             "configured to handle such a flag.");
       }
       return Status::Success;
-      });
+      },
+      false));
 }
 
 Status
@@ -476,9 +477,16 @@ InferenceRequest::Release(
 {
   // Invoke the release callbacks added internally before releasing the
   // request to user provided callback.
-  for (auto it = request->release_callbacks_.rbegin();
-       it != request->release_callbacks_.rend(); it++) {
-    RETURN_IF_ERROR((*it)(request, release_flags));
+
+  // Invoke callbacks in reverse order. Evict internal callbacks for reusing
+  // inference request object. Note: the callback is copied out of the vector
+  // before the erase so the erase cannot invalidate it.
+  auto& release_callbacks = request->release_callbacks_;
+  for (int i = static_cast<int>(release_callbacks.size()) - 1; i >= 0; --i) {
+    auto [release_fn, is_internal] = release_callbacks[i];
+    if (is_internal) {
+      release_callbacks.erase(release_callbacks.begin() + i);
+    }
+    RETURN_IF_ERROR(release_fn(request, release_flags));
     if (request == nullptr) {
       return Status::Success;
     }
diff --git a/src/infer_request.h b/src/infer_request.h
index 94054dc39..1c7e83d6d 100644
--- a/src/infer_request.h
+++ b/src/infer_request.h
@@ -548,7 +548,7 @@ class InferenceRequest {
   // and they will be invoked in reversed order.
   Status AddInternalReleaseCallback(InternalReleaseFn&& callback)
   {
-    release_callbacks_.emplace_back(std::move(callback));
+    release_callbacks_.emplace_back(std::make_pair(std::move(callback), true));
     return Status::Success;
   }
 
@@ -832,8 +832,9 @@ class InferenceRequest {
   TRITONSERVER_InferenceRequestReleaseFn_t release_fn_;
   void* release_userp_;
 
-  // Additional release callbacks invoked before 'release_fn_'.
-  std::vector<InternalReleaseFn> release_callbacks_;
+  // Additional release callbacks invoked before 'release_fn_'. Set boolean to
+  // true if release callback is internal and should be evicted after invoking.
+  std::vector<std::pair<InternalReleaseFn, bool>> release_callbacks_;
 
   // Delegator to be invoked on sending responses.
   std::function<void(std::unique_ptr<InferenceResponse>&&, const uint32_t)>