
fix: Revert async execution of ensemble model ResponseComplete callback #435

Closed · wants to merge 2 commits
2 changes: 1 addition & 1 deletion src/constants.h
@@ -62,7 +62,7 @@ constexpr char kPythonBackend[] = "python";

#ifdef TRITON_ENABLE_ENSEMBLE
constexpr char kEnsemblePlatform[] = "ensemble";
constexpr uint64_t ENSEMBLE_CB_POOL_SIZE = 8u;
Contributor Author

Decrease the thread pool size because fewer callbacks will run asynchronously.

Contributor

why?

Contributor

Do you even need this constant?

Contributor Author (@yinggeh, Apr 29, 2025)

Yes. The RequestComplete callback still uses the thread pool to run asynchronously, which benefits ensemble model throughput.

Contributor Author

void
EnsembleContext::RequestComplete(
    TRITONSERVER_InferenceRequest* request, const uint32_t flags, void* userp)
{
  auto request_tracker = reinterpret_cast<RequestTracker*>(userp);
  auto pool = request_tracker->CallbackPool();
  auto fn = [request, flags, request_tracker]() {
    if ((flags & TRITONSERVER_REQUEST_RELEASE_ALL) != 0) {
      LOG_TRITONSERVER_ERROR(
          TRITONSERVER_InferenceRequestDelete(request),
          "deleting ensemble inference request");
      if (request_tracker->DecrementCounter()) {
        delete request_tracker;
      }
    }
  };
  // Attempt to enqueue the callback. If all workers are busy and queue is at
  // capacity, execute the callback immediately.
  if (pool->TaskQueueSize() < pool->Size()) {
    pool->Enqueue(fn);
  } else {
    fn();
  }
}

Contributor

How was this number picked, and why this number?

Contributor Author

The previous pool size of 8 was explained in #429 (comment).
In a non-decoupled ensemble model, this PR removes half of the async callbacks, so I reduced the pool size by half, to 4.

Contributor

My concern is how general those experiments were.

Contributor Author

Could you elaborate? If the thread pool queue is full, the new task executes synchronously as before to avoid delay. See:

  // Attempt to enqueue the callback. If all workers are busy and queue is at
  // capacity, execute the callback immediately.
  if (pool->TaskQueueSize() < pool->Size()) {
    pool->Enqueue(fn);
  } else {
    fn();
  }
}
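
Below is a minimal, self-contained sketch of that "enqueue if there is room, otherwise run inline" dispatch. It is not Triton's triton::common::ThreadPool; TinyPool and DispatchCallback are illustrative names, and the sketch only assumes the three operations the callback above actually uses: Size(), TaskQueueSize(), and Enqueue().

// Hypothetical stand-in for the callback thread pool; illustration only.
#include <condition_variable>
#include <functional>
#include <mutex>
#include <queue>
#include <thread>
#include <vector>

class TinyPool {
 public:
  explicit TinyPool(size_t size) : size_(size)
  {
    for (size_t i = 0; i < size_; ++i) {
      workers_.emplace_back([this] { Work(); });
    }
  }
  ~TinyPool()
  {
    {
      std::lock_guard<std::mutex> lk(mu_);
      stop_ = true;
    }
    cv_.notify_all();
    for (auto& w : workers_) {
      w.join();
    }
  }
  size_t Size() const { return size_; }
  size_t TaskQueueSize()
  {
    std::lock_guard<std::mutex> lk(mu_);
    return tasks_.size();
  }
  void Enqueue(std::function<void()> fn)
  {
    {
      std::lock_guard<std::mutex> lk(mu_);
      tasks_.push(std::move(fn));
    }
    cv_.notify_one();
  }

 private:
  void Work()
  {
    for (;;) {
      std::function<void()> fn;
      {
        std::unique_lock<std::mutex> lk(mu_);
        cv_.wait(lk, [this] { return stop_ || !tasks_.empty(); });
        if (stop_ && tasks_.empty()) {
          return;
        }
        fn = std::move(tasks_.front());
        tasks_.pop();
      }
      fn();
    }
  }

  const size_t size_;
  std::vector<std::thread> workers_;
  std::queue<std::function<void()>> tasks_;
  std::mutex mu_;
  std::condition_variable cv_;
  bool stop_ = false;
};

// Mirrors the dispatch in the callback: bound the backlog to the number of
// workers; if the queue is already that deep, run the task inline so the
// caller is never delayed by more than the callback itself.
void
DispatchCallback(TinyPool* pool, std::function<void()> fn)
{
  if (pool->TaskQueueSize() < pool->Size()) {
    pool->Enqueue(std::move(fn));
  } else {
    fn();  // fall back to synchronous execution
  }
}

Bounding the backlog to the pool size keeps at most one pending task per worker, so the asynchronous path only helps when workers are actually available.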

constexpr uint64_t ENSEMBLE_CB_POOL_SIZE = 4u;
#endif // TRITON_ENABLE_ENSEMBLE

constexpr char kTensorRTExecutionAccelerator[] = "tensorrt";
37 changes: 9 additions & 28 deletions src/ensemble_scheduler/ensemble_scheduler.cc
@@ -331,8 +331,6 @@ class EnsembleContext {
void CacheEnsembleTopLevelRequest(
std::unique_ptr<InferenceResponse>& response);

triton::common::ThreadPool* CallbackPool() const { return callback_pool_; }

InferenceServer* is_;

EnsembleInfo* info_;
@@ -382,10 +380,6 @@ class EnsembleContext {
TRITONSERVER_ResponseAllocator,
decltype(&TRITONSERVER_ResponseAllocatorDelete)>
allocator_;

// The thread pool used to execute ensemble callbacks and reduce e2e latency.
// The thread pool is managed by InferenceServer.
triton::common::ThreadPool* const callback_pool_;
};

EnsembleContext::EnsembleContext(
@@ -394,8 +388,7 @@ EnsembleContext::EnsembleContext(
EnsembleInfo* info, std::unique_ptr<InferenceRequest>& request,
cudaStream_t stream, triton::common::ThreadPool* callback_pool)
: is_(is), info_(info), stream_(stream), inflight_step_counter_(0),
allocator_(nullptr, TRITONSERVER_ResponseAllocatorDelete),
callback_pool_(callback_pool)
allocator_(nullptr, TRITONSERVER_ResponseAllocatorDelete)
{
uint64_t compute_start_ns = 0;
INFER_STATS_SET_TIMESTAMP(compute_start_ns);
@@ -642,26 +635,14 @@ void
EnsembleContext::ResponseComplete(
TRITONSERVER_InferenceResponse* response, const uint32_t flags, void* userp)
{
auto step_raw_ptr = reinterpret_cast<Step*>(userp);
auto pool = step_raw_ptr->ctx_->CallbackPool();
auto fn = [response, flags, step_raw_ptr]() {
auto step_ptr = std::unique_ptr<Step>(step_raw_ptr);
step_ptr->response_flags_ = flags;
step_ptr->response_ = response;

EnsembleContext::Proceed(step_ptr->ctx_, step_ptr);
// Expecting more responses
if ((flags & TRITONSERVER_RESPONSE_COMPLETE_FINAL) == 0) {
step_ptr.release();
}
};

// Attempt to enqueue the callback. If all workers are busy and queue is at
// capacity, execute the callback immediately.
if (pool->TaskQueueSize() < pool->Size()) {
pool->Enqueue(fn);
} else {
fn();
auto step_ptr = std::unique_ptr<Step>(reinterpret_cast<Step*>(userp));
step_ptr->response_flags_ = flags;
step_ptr->response_ = response;

EnsembleContext::Proceed(step_ptr->ctx_, step_ptr);
// Expecting more responses
if ((flags & TRITONSERVER_RESPONSE_COMPLETE_FINAL) == 0) {
step_ptr.release();
}
}
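
For readers puzzled by the step_ptr.release() call above, here is a minimal, self-contained sketch of the ownership pattern, under the assumption that it behaves like the Triton code: wrap the raw pointer in a unique_ptr so the step is deleted exactly once, on the callback carrying the final flag, and release it on every non-final callback. StepLike, OnResponse, and RESPONSE_COMPLETE_FINAL are hypothetical stand-ins, not Triton types.

#include <cstdint>
#include <iostream>
#include <memory>

// Stand-in for TRITONSERVER_RESPONSE_COMPLETE_FINAL; illustration only.
constexpr uint32_t RESPONSE_COMPLETE_FINAL = 1u;

struct StepLike {
  ~StepLike() { std::cout << "step destroyed\n"; }
};

void
OnResponse(void* userp, uint32_t flags)
{
  // Take ownership for the duration of this callback.
  auto step = std::unique_ptr<StepLike>(reinterpret_cast<StepLike*>(userp));
  // ... process the response ...
  if ((flags & RESPONSE_COMPLETE_FINAL) == 0) {
    // More responses are expected for this step; keep it alive.
    step.release();
  }
  // On the final response the unique_ptr goes out of scope and deletes it.
}

int
main()
{
  auto* step = new StepLike();
  OnResponse(step, 0);                        // non-final: step survives
  OnResponse(step, RESPONSE_COMPLETE_FINAL);  // final: step is destroyed here
}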
