Commit c6c45ff

Add queue size (pending request count) metric per-model (#246)
1 parent fc2ada2 commit c6c45ff

22 files changed: +371 −52 lines

src/backend_model.cc

Lines changed: 1 addition & 4 deletions
@@ -687,9 +687,7 @@ TritonModel::SetConfiguredScheduler(
     RETURN_IF_ERROR(DynamicBatchScheduler::Create(
         this, nullptr, 0 /*nice*/, true /* dynamic_batching_enabled */,
         config_.max_batch_size(), enforce_equal_shape_tensors,
-        config_.dynamic_batching(),
-        config_.response_cache().enable() /* response_cache_enable */,
-        &scheduler));
+        config_.dynamic_batching(), &scheduler));
   } else {
     // Default scheduler. Use dynamic batch scheduler (with batching
     // disabled) as the default scheduler.
@@ -699,7 +697,6 @@ TritonModel::SetConfiguredScheduler(
         std::unordered_map<
             std::string, bool>() /* enforce_equal_shape_tensors */,
         false /* preserve_ordering */,
-        config_.response_cache().enable() /* response_cache_enable */,
         std::set<int32_t>() /* preferred_batch_sizes */,
         0 /* max_queue_delay_microseconds */, &scheduler));
   }
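
Both `Create` calls above drop the `response_cache_enable` argument because the scheduler can now ask its owning model whether caching is enabled (see the `dynamic_batch_scheduler.cc` hunks below). A minimal sketch of that pattern, with hypothetical `Owner`/`Worker` names that are illustrative only and not Triton classes:

#include <iostream>

// Hypothetical owner that exposes a config-derived flag.
class Owner {
 public:
  explicit Owner(bool cache_enabled) : cache_enabled_(cache_enabled) {}
  bool ResponseCacheEnabled() const { return cache_enabled_; }

 private:
  bool cache_enabled_;
};

// Instead of threading `bool response_cache_enable` through every
// constructor, the worker queries the owner it already points to.
class Worker {
 public:
  explicit Worker(const Owner* owner)
      : cache_enabled_(owner->ResponseCacheEnabled()) {}
  bool CacheEnabled() const { return cache_enabled_; }

 private:
  bool cache_enabled_;
};

int main() {
  Owner model(/*cache_enabled=*/true);
  Worker scheduler(&model);
  std::cout << "cache enabled: " << scheduler.CacheEnabled() << "\n";
  return 0;
}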

src/backend_model_instance.cc

Lines changed: 40 additions & 8 deletions
@@ -26,6 +26,8 @@

 #include "backend_model_instance.h"

+#include "status.h"
+
 #ifndef _WIN32
 #include <sys/resource.h>
 #include <sys/syscall.h>
@@ -185,7 +187,7 @@ TritonModelInstance::TritonModelInstance(
   // Let every metric reporter know if caching is enabled to correctly include
   // cache miss time into request duration on cache misses.
   const bool response_cache_enabled =
-      model_->Config().response_cache().enable() &&
+      model_->ResponseCacheEnabled() &&
       model_->Server()->ResponseCacheEnabled();
   MetricModelReporter::Create(
       model_->Name(), model_->Version(), id, response_cache_enabled,
@@ -534,25 +536,55 @@ TritonModelInstance::GenerateWarmupData()
   return Status::Success;
 }

-void
+Status
+TritonModelInstance::PrepareRequestsForExecution(
+    std::vector<std::unique_ptr<InferenceRequest>>& requests)
+{
+  for (auto& r : requests) {
+    // Load the input states for the inference request.
+    RETURN_IF_ERROR(r->LoadInputStates());
+    // Set request state to signify that the request is no longer pending.
+    RETURN_IF_ERROR(r->SetState(InferenceRequest::State::EXECUTING));
+  }
+
+  return Status::Success;
+}
+
+Status
+TritonModelInstance::PrepareRequestsOrRespond(
+    std::vector<std::unique_ptr<InferenceRequest>>& requests)
+{
+  auto status = PrepareRequestsForExecution(requests);
+  // If any errors occurred, respond with the error for each request.
+  if (!status.IsOk()) {
+    for (auto& r : requests) {
+      InferenceRequest::RespondIfError(r, status, true /* release_requests */);
+    }
+    // Log a single error for the batch of requests for better visibility.
+    LOG_STATUS_ERROR(status, "Requests failed pre-execution checks");
+  }
+
+  return status;
+}
+
+Status
 TritonModelInstance::Schedule(
-    std::vector<std::unique_ptr<InferenceRequest>>&& requests,
-    const std::function<void()>& OnCompletion)
+    std::vector<std::unique_ptr<InferenceRequest>>&& requests)
 {
+  // Prepare requests for execution; respond to the requests if any error occurs.
+  RETURN_IF_ERROR(PrepareRequestsOrRespond(requests));
+
   // Use a thread local vector to avoid needing to malloc each
   // time an inference is run.
   thread_local std::vector<TRITONBACKEND_Request*> triton_requests(1024);
   triton_requests.clear();
   for (auto& r : requests) {
-    // Load the input states for the inference request.
-    r->LoadInputStates();
     triton_requests.push_back(
         reinterpret_cast<TRITONBACKEND_Request*>(r.release()));
   }

   Execute(triton_requests);
-
-  OnCompletion();
+  return Status::Success;
 }

 Status
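
The new `PrepareRequestsForExecution` / `PrepareRequestsOrRespond` pair is what moves a request out of the pending state that the new queue-size metric counts: inputs are loaded, the state flips to EXECUTING, and on any failure every request in the batch is answered with the error before `Schedule` returns. A self-contained sketch of the same prepare-or-respond pattern; the `Status`, `State`, `Request`, and `RespondWithError` names below are stand-ins, not Triton's types:

#include <iostream>
#include <memory>
#include <string>
#include <vector>

// Stand-in status type; Triton's Status is richer than this.
struct Status {
  bool ok;
  std::string msg;
  Status() : ok(true) {}
  Status(bool o, std::string m) : ok(o), msg(std::move(m)) {}
};

enum class State { PENDING, EXECUTING };

// Stand-in request: loading input state may fail.
struct Request {
  State state = State::PENDING;
  bool fail_load = false;
  Status LoadInputStates() {
    return fail_load ? Status(false, "failed to load input state") : Status();
  }
  Status SetState(State s) {
    state = s;
    return Status();
  }
};

// Stand-in for InferenceRequest::RespondIfError.
void RespondWithError(Request&, const Status& s) {
  std::cerr << "responding with error: " << s.msg << "\n";
}

// Prepare every request or, on the first failure, respond to all of them
// with that error so none are left dangling.
Status PrepareOrRespond(std::vector<std::unique_ptr<Request>>& requests) {
  Status status;
  for (auto& r : requests) {
    status = r->LoadInputStates();
    if (!status.ok) break;
    status = r->SetState(State::EXECUTING);  // no longer counted as pending
    if (!status.ok) break;
  }
  if (!status.ok) {
    for (auto& r : requests) RespondWithError(*r, status);
  }
  return status;
}

int main() {
  std::vector<std::unique_ptr<Request>> requests;
  requests.push_back(std::make_unique<Request>());
  requests.push_back(std::make_unique<Request>());
  std::cout << "prepared ok: " << PrepareOrRespond(requests).ok << "\n";  // 1
  return 0;
}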

src/backend_model_instance.h

Lines changed: 5 additions & 3 deletions
@@ -116,9 +116,7 @@ class TritonModelInstance {

   Status Initialize();
   Status WarmUp();
-  void Schedule(
-      std::vector<std::unique_ptr<InferenceRequest>>&& requests,
-      const std::function<void()>& OnCompletion);
+  Status Schedule(std::vector<std::unique_ptr<InferenceRequest>>&& requests);

   TritonModel* Model() const { return model_; }
   void* State() { return state_; }
@@ -220,6 +218,10 @@ class TritonModelInstance {
       const bool device_blocking);
   Status GenerateWarmupData();

+  Status PrepareRequestsForExecution(
+      std::vector<std::unique_ptr<InferenceRequest>>& requests);
+  Status PrepareRequestsOrRespond(
+      std::vector<std::unique_ptr<InferenceRequest>>& requests);
   void Execute(std::vector<TRITONBACKEND_Request*>& triton_requests);

   std::shared_ptr<TritonBackendThread> triton_backend_thread_;

src/constants.h

Lines changed: 4 additions & 1 deletion
@@ -79,6 +79,9 @@ constexpr char kMetricsLabelGpuUuid[] = "gpu_uuid";
 constexpr char kWarmupDataFolder[] = "warmup";
 constexpr char kInitialStateFolder[] = "initial_state";

+// Metric names
+constexpr char kPendingRequestMetric[] = "inf_pending_request_count";
+
 constexpr uint64_t NANOS_PER_SECOND = 1000000000;
 constexpr uint64_t NANOS_PER_MILLIS = 1000000;
 constexpr int MAX_GRPC_MESSAGE_SIZE = INT32_MAX;
@@ -90,7 +93,7 @@ constexpr size_t CUDA_IPC_STRUCT_SIZE = 64;
 // MetricModelReporter expects a device ID for GPUs, but we reuse this device
 // ID for other metrics as well such as for CPU and Response Cache metrics
 constexpr int METRIC_REPORTER_ID_CPU = -1;
-constexpr int METRIC_REPORTER_ID_RESPONSE_CACHE = -2;
+constexpr int METRIC_REPORTER_ID_UTILITY = -2;
 #endif

 // Note: This can be replaced with std::byte starting in c++17
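
The new `kPendingRequestMetric` constant names the per-model queue-size metric, and `METRIC_REPORTER_ID_RESPONSE_CACHE` becomes the more general `METRIC_REPORTER_ID_UTILITY` so one non-GPU reporter ID can serve cache and queue metrics alike. The reporter plumbing that exports the value is not part of this excerpt; as an illustration only, the intended gauge semantics (increment when a request is enqueued, decrement when it leaves the pending state) can be sketched with stand-in types like this:

#include <cstdint>
#include <iostream>
#include <map>
#include <mutex>
#include <string>

// Illustrative only: a per-model gauge with the increment/decrement semantics
// of "inf_pending_request_count". Triton's MetricModelReporter (not shown in
// this diff) is what actually exposes the value for scraping.
class PendingRequestGauge {
 public:
  void OnEnqueue(const std::string& model) {
    std::lock_guard<std::mutex> lk(mu_);
    ++pending_[model];  // request entered the queue: it is now "pending"
  }
  void OnExecute(const std::string& model) {
    std::lock_guard<std::mutex> lk(mu_);
    --pending_[model];  // request moved to EXECUTING: no longer pending
  }
  int64_t Value(const std::string& model) {
    std::lock_guard<std::mutex> lk(mu_);
    return pending_[model];
  }

 private:
  std::mutex mu_;
  std::map<std::string, int64_t> pending_;
};

int main() {
  PendingRequestGauge gauge;
  gauge.OnEnqueue("resnet50");
  gauge.OnEnqueue("resnet50");
  gauge.OnExecute("resnet50");
  std::cout << "pending for resnet50: " << gauge.Value("resnet50") << "\n";  // 1
  return 0;
}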

src/dynamic_batch_scheduler.cc

Lines changed: 9 additions & 19 deletions
@@ -59,7 +59,7 @@ DynamicBatchScheduler::DynamicBatchScheduler(
     TritonModel* model, TritonModelInstance* model_instance,
     const bool dynamic_batching_enabled, const int32_t max_batch_size,
     const std::unordered_map<std::string, bool>& enforce_equal_shape_tensors,
-    const bool preserve_ordering, const bool response_cache_enable,
+    const bool preserve_ordering,
     const std::set<int32_t>& preferred_batch_sizes,
     const uint64_t max_queue_delay_microseconds,
     const inference::ModelQueuePolicy& default_queue_policy,
@@ -79,16 +79,8 @@ DynamicBatchScheduler::DynamicBatchScheduler(
   rate_limiter_ = model_->Server()->GetRateLimiter();
   // Both the server and model config should specify
   // caching enabled for model to utilize response cache.
-  response_cache_enabled_ =
-      response_cache_enable && model_->Server()->ResponseCacheEnabled();
-#ifdef TRITON_ENABLE_METRICS
-  // Initialize metric reporter for cache statistics if cache enabled
-  if (response_cache_enabled_) {
-    MetricModelReporter::Create(
-        model_name_, model_->Version(), METRIC_REPORTER_ID_RESPONSE_CACHE,
-        response_cache_enabled_, model_->Config().metric_tags(), &reporter_);
-  }
-#endif  // TRITON_ENABLE_METRICS
+  response_cache_enabled_ = model_->ResponseCacheEnabled() &&
+                            model_->Server()->ResponseCacheEnabled();
   max_preferred_batch_size_ = 0;
   for (const auto size : preferred_batch_sizes_) {
     max_preferred_batch_size_ =
@@ -108,7 +100,7 @@ DynamicBatchScheduler::Create(
     TritonModel* model, TritonModelInstance* model_instance, const int nice,
     const bool dynamic_batching_enabled, const int32_t max_batch_size,
     const std::unordered_map<std::string, bool>& enforce_equal_shape_tensors,
-    const bool preserve_ordering, const bool response_cache_enable,
+    const bool preserve_ordering,
     const std::set<int32_t>& preferred_batch_sizes,
     const uint64_t max_queue_delay_microseconds,
     std::unique_ptr<Scheduler>* scheduler)
@@ -122,8 +114,7 @@ DynamicBatchScheduler::Create(

   return Create(
       model, model_instance, nice, dynamic_batching_enabled, max_batch_size,
-      enforce_equal_shape_tensors, batcher_config, response_cache_enable,
-      scheduler);
+      enforce_equal_shape_tensors, batcher_config, scheduler);
 }

 Status
@@ -132,7 +123,7 @@ DynamicBatchScheduler::Create(
     const bool dynamic_batching_enabled, const int32_t max_batch_size,
     const std::unordered_map<std::string, bool>& enforce_equal_shape_tensors,
     const inference::ModelDynamicBatching& batcher_config,
-    const bool response_cache_enable, std::unique_ptr<Scheduler>* scheduler)
+    std::unique_ptr<Scheduler>* scheduler)
 {
   std::set<int32_t> preferred_batch_sizes;
   for (const auto size : batcher_config.preferred_batch_size()) {
@@ -142,8 +133,7 @@ DynamicBatchScheduler::Create(
   DynamicBatchScheduler* dyna_sched = new DynamicBatchScheduler(
       model, model_instance, dynamic_batching_enabled, max_batch_size,
       enforce_equal_shape_tensors, batcher_config.preserve_ordering(),
-      response_cache_enable, preferred_batch_sizes,
-      batcher_config.max_queue_delay_microseconds(),
+      preferred_batch_sizes, batcher_config.max_queue_delay_microseconds(),
       batcher_config.default_queue_policy(), batcher_config.priority_levels(),
       batcher_config.priority_queue_policy());
   std::unique_ptr<DynamicBatchScheduler> sched(dyna_sched);
@@ -681,7 +671,7 @@ DynamicBatchScheduler::DelegateResponse(
       // Use model_ to update stats directly because request object can be
       // released by the backend before getting to this callback.
       model_->MutableStatsAggregator()->UpdateSuccessCacheMiss(
-          reporter_.get(), cache_miss_ns);
+          model_->MetricReporter().get(), cache_miss_ns);
 #endif  // TRITON_ENABLE_STATS
       if (!status.IsOk()) {
         LOG_ERROR << "Failed to insert key [" << key
@@ -736,7 +726,7 @@ DynamicBatchScheduler::CacheLookUp(
 #ifdef TRITON_ENABLE_STATS
     // Update model metrics/stats on cache hits
     // Backends will update metrics as normal on cache misses
-    request->ReportStatisticsCacheHit(reporter_.get());
+    request->ReportStatisticsCacheHit(model_->MetricReporter().get());
 #endif  // TRITON_ENABLE_STATS
   }
 }
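
With the scheduler's private `reporter_` gone (see the header diff below), cache statistics now flow through the reporter owned by the model via `model_->MetricReporter()`, so a single per-model reporter can also carry the new pending-request count. A minimal sketch of that ownership move; `Model`, `Reporter`, and `Scheduler` here are stand-ins, not Triton's classes:

#include <iostream>
#include <memory>
#include <string>

// Stand-in for a per-model metric reporter.
class Reporter {
 public:
  explicit Reporter(std::string model) : model_(std::move(model)) {}
  void Report(const std::string& what) {
    std::cout << model_ << ": " << what << "\n";
  }

 private:
  std::string model_;
};

// The model owns the reporter ...
class Model {
 public:
  explicit Model(const std::string& name)
      : reporter_(std::make_shared<Reporter>(name)) {}
  const std::shared_ptr<Reporter>& MetricReporter() const { return reporter_; }

 private:
  std::shared_ptr<Reporter> reporter_;
};

// ... and the scheduler borrows it instead of creating its own, so queue and
// cache metrics for one model end up on the same reporter.
class Scheduler {
 public:
  explicit Scheduler(Model* model) : model_(model) {}
  void OnCacheHit() { model_->MetricReporter()->Report("cache hit"); }

 private:
  Model* model_;
};

int main() {
  Model model("resnet50");
  Scheduler sched(&model);
  sched.OnCacheHit();
  return 0;
}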

src/dynamic_batch_scheduler.h

Lines changed: 3 additions & 8 deletions
@@ -55,7 +55,7 @@ class DynamicBatchScheduler : public Scheduler {
       TritonModel* model, TritonModelInstance* model_instance, const int nice,
       const bool dynamic_batching_enabled, const int32_t max_batch_size,
       const std::unordered_map<std::string, bool>& enforce_equal_shape_tensors,
-      const bool preserve_ordering, const bool response_cache_enable,
+      const bool preserve_ordering,
       const std::set<int32_t>& preferred_batch_sizes,
       const uint64_t max_queue_delay_microseconds,
       std::unique_ptr<Scheduler>* scheduler);
@@ -68,7 +68,7 @@ class DynamicBatchScheduler : public Scheduler {
       const bool dynamic_batching_enabled, const int32_t max_batch_size,
       const std::unordered_map<std::string, bool>& enforce_equal_shape_tensors,
       const inference::ModelDynamicBatching& batcher_config,
-      const bool response_cache_enable, std::unique_ptr<Scheduler>* scheduler);
+      std::unique_ptr<Scheduler>* scheduler);

   ~DynamicBatchScheduler();

@@ -88,14 +88,12 @@ class DynamicBatchScheduler : public Scheduler {
   // \see Scheduler::Stop()
   void Stop() override { stop_ = true; }

-  MetricModelReporter* MetricReporter() const { return reporter_.get(); }
-
  private:
   DynamicBatchScheduler(
       TritonModel* model, TritonModelInstance* model_instance,
       const bool dynamic_batching_enabled, const int32_t max_batch_size,
       const std::unordered_map<std::string, bool>& enforce_equal_shape_tensors,
-      const bool preserve_ordering, const bool response_cache_enable,
+      const bool preserve_ordering,
       const std::set<int32_t>& preferred_batch_sizes,
       const uint64_t max_queue_delay_microseconds,
       const inference::ModelQueuePolicy& default_queue_policy,
@@ -188,9 +186,6 @@ class DynamicBatchScheduler : public Scheduler {

   // Preserves the order in which responses are finalized
   std::mutex finalize_mtx_;
-
-  // Reporter for metrics, or nullptr if no metrics should be reported
-  std::shared_ptr<MetricModelReporter> reporter_;
 };

 }}  // namespace triton::core

src/ensemble_scheduler.cc

Lines changed: 4 additions & 1 deletion
@@ -222,7 +222,7 @@ struct TensorData {
 // scope after the step's callback is finished. The step's callback will
 // schedule new steps if available and the last step will finish the ensemble
 // request.
-// So we don't have to maintian the context in scheduler as the shared_ptr
+// So we don't have to maintain the context in scheduler as the shared_ptr
 // will destroy the context for us if there are no "in-flight" steps.
 class EnsembleContext {
  public:
@@ -1319,6 +1319,9 @@ EnsembleScheduler::Enqueue(std::unique_ptr<InferenceRequest>& request)
   // Add additional callback to keep track of in-flight count
   ++inflight_count_;
   request->AddInternalReleaseCallback([this]() { --inflight_count_; });
+  // Consider the top-level "ensemble" request executing once passed to a
+  // composing model. Composing model requests will track their own states.
+  RETURN_IF_ERROR(request->SetState(InferenceRequest::State::EXECUTING));
   std::shared_ptr<EnsembleContext> context(new EnsembleContext(
       metric_reporter_.get(), stats_aggregator_, is_, info_.get(), request,
       stream_));
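
For ensembles, the top-level request is marked EXECUTING as soon as it is handed to the composing models, so it stops counting as pending even though each composing request tracks its own state; the in-flight count is balanced by a release callback. A self-contained sketch of that callback pattern; the `Request` type below is a stand-in, not Triton's:

#include <atomic>
#include <functional>
#include <iostream>
#include <vector>

// Stand-in request that runs registered callbacks when it is released.
class Request {
 public:
  void AddInternalReleaseCallback(std::function<void()> cb) {
    release_callbacks_.push_back(std::move(cb));
  }
  void Release() {
    for (auto& cb : release_callbacks_) cb();
    release_callbacks_.clear();
  }

 private:
  std::vector<std::function<void()>> release_callbacks_;
};

int main() {
  std::atomic<int> inflight_count{0};

  Request request;
  ++inflight_count;  // request enters the ensemble scheduler
  request.AddInternalReleaseCallback([&]() { --inflight_count; });

  // ... the ensemble forwards work to composing models here ...

  request.Release();  // release callback restores the in-flight count
  std::cout << "in-flight after release: " << inflight_count.load() << "\n";  // 0
  return 0;
}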
