fix: Fix gRPC cancellation race condition (#8078)

yinggeh · web-flow · commit 42811e0cd3ed · 2025-04-07T18:55:23.000-07:00
diff --git a/qa/L0_request_cancellation/grpc_cancellation_test.py b/qa/L0_request_cancellation/grpc_cancellation_test.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 
-# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -202,12 +202,34 @@ def test_grpc_async_infer_response_complete_during_cancellation(self):
         )  # ensure the cancellation is processed
         self._assert_callback_cancelled()
 
-    def test_grpc_async_infer_cancellation_during_response_complete(self):
+    def test_grpc_async_infer_cancellation_before_finish_0(self):
+        # First version of test_grpc_async_infer_cancellation_before_finish
+        # Cancellation notification is processed before the final response state.
         # long test
-        self.test_duration_delta = 2.5
+        self.test_duration_delta = 2
         delay_notification_sec = (
             int(os.getenv("TRITONSERVER_DELAY_GRPC_NOTIFICATION")) / 1000
         )
+        future = self._client.async_infer(
+            model_name=self._model_name,
+            inputs=self._inputs,
+            callback=self._callback,
+            outputs=self._outputs,
+        )
+        # ensure the cancellation is received between InferResponseComplete checking cancellation and Finish
+        time.sleep(self._model_delay + 2)
+        future.cancel()
+        time.sleep(delay_notification_sec + 1)  # ensure the cancellation is processed
+        self._assert_callback_cancelled()
+
+    def test_grpc_async_infer_cancellation_before_finish_1(self):
+        # Second version of test_grpc_async_infer_cancellation_before_finish
+        # Cancellation notification is processed after the final response state.
+        # long test
+        self.test_duration_delta = 2
+        delay_process_entry_sec = (
+            int(os.getenv("TRITONSERVER_DELAY_GRPC_PROCESS_ENTRY")) / 1000
+        )
         delay_response_completion_sec = (
             int(os.getenv("TRITONSERVER_DELAY_RESPONSE_COMPLETION")) / 1000
         )
@@ -218,13 +240,38 @@ def test_grpc_async_infer_cancellation_during_response_complete(self):
             outputs=self._outputs,
         )
         # ensure the cancellation is received between InferResponseComplete checking cancellation and Finish
-        time.sleep(self._model_delay + 2)
+        time.sleep(self._model_delay + delay_process_entry_sec + 2)
         future.cancel()
         time.sleep(
-            delay_notification_sec + delay_response_completion_sec
+            delay_response_completion_sec
         )  # ensure the cancellation is processed
         self._assert_callback_cancelled()
 
+    def test_grpc_async_infer_cancellation_before_response_complete_and_process_after_final_response(
+        self,
+    ):
+        # Cancellation is received before InferResponseComplete, and the
+        # notification state is processed after the final response state.
+        # long test
+        self.test_duration_delta = 2
+        delay_notification_sec = (
+            int(os.getenv("TRITONSERVER_DELAY_GRPC_NOTIFICATION")) / 1000
+        )
+        delay_response_complete_exec_sec = (
+            int(os.getenv("TRITONSERVER_DELAY_RESPONSE_COMPLETE_EXEC")) / 1000
+        )
+        future = self._client.async_infer(
+            model_name=self._model_name,
+            inputs=self._inputs,
+            callback=self._callback,
+            outputs=self._outputs,
+        )
+        # ensure the cancellation is received before InferResponseComplete checking cancellation
+        time.sleep(self._model_delay + 2)
+        future.cancel()
+        time.sleep(delay_notification_sec + 1)  # ensure the cancellation is processed
+        self._assert_callback_cancelled()
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/qa/L0_request_cancellation/test.sh b/qa/L0_request_cancellation/test.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -84,20 +84,28 @@ for TEST_CASE in "test_grpc_async_infer" \
                     "test_aio_grpc_stream_infer" \
                     "test_grpc_async_infer_cancellation_at_step_start" \
                     "test_grpc_async_infer_response_complete_during_cancellation" \
-                    "test_grpc_async_infer_cancellation_during_response_complete"; do
+                    "test_grpc_async_infer_cancellation_before_finish_0" \
+                    "test_grpc_async_infer_cancellation_before_finish_1" \
+                    "test_grpc_async_infer_cancellation_before_response_complete_and_process_after_final_response"; do
     TEST_LOG="./grpc_cancellation_test.$TEST_CASE.log"
     SERVER_LOG="grpc_cancellation_test.$TEST_CASE.server.log"
     if [ "$TEST_CASE" == "test_grpc_async_infer_cancellation_at_step_start" ]; then
         export TRITONSERVER_DELAY_GRPC_PROCESS=5000
     elif [ "$TEST_CASE" == "test_grpc_async_infer_response_complete_during_cancellation" ]; then
         export TRITONSERVER_DELAY_GRPC_NOTIFICATION=5000
         export TRITONSERVER_DELAY_GRPC_ENQUEUE=5000
-    elif [ "$TEST_CASE" == "test_grpc_async_infer_cancellation_during_response_complete" ]; then
+    elif [ "$TEST_CASE" == "test_grpc_async_infer_cancellation_before_finish_0" ]; then
         export TRITONSERVER_DELAY_GRPC_NOTIFICATION=5000
         export TRITONSERVER_DELAY_RESPONSE_COMPLETION=5000
+    elif [ "$TEST_CASE" == "test_grpc_async_infer_cancellation_before_finish_1" ]; then
+        export TRITONSERVER_DELAY_GRPC_PROCESS_ENTRY=1000
+        export TRITONSERVER_DELAY_RESPONSE_COMPLETION=5000
+    elif [ "$TEST_CASE" == "test_grpc_async_infer_cancellation_before_response_complete_and_process_after_final_response" ]; then
+        export TRITONSERVER_DELAY_GRPC_NOTIFICATION=5000
+        export TRITONSERVER_DELAY_RESPONSE_COMPLETE_EXEC=5000
     fi
 
-    SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=1"
+    SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=2"
     run_server
     if [ "$SERVER_PID" == "0" ]; then
         echo -e "\n***\n*** Failed to start $SERVER\n***"
@@ -123,6 +131,23 @@ for TEST_CASE in "test_grpc_async_infer" \
         cat $SERVER_LOG
         RET=1
     fi
+
+    # Tests "test_grpc_async_infer" and "test_aio_grpc_async_infer" end
+    # prematurely, before the state is released.
+    if [[ "$TEST_CASE" != "test_grpc_async_infer" && "$TEST_CASE" != "test_aio_grpc_async_infer" ]]; then
+        count=$(grep -o "StateRelease" $SERVER_LOG | wc -l)
+        state_released=${state_released:=1}
+        if [ $count == 0 ]; then
+            echo -e "\n***\n*** State not released by server on $TEST_CASE\n***"
+            cat $SERVER_LOG
+            RET=1
+        elif [ $count -ne $state_released ]; then
+            echo -e "\n***\n*** Unexpected states released by server on $TEST_CASE. Expected $state_released but released $count.\n***"
+            cat $SERVER_LOG
+            RET=1
+        fi
+        unset state_released
+    fi
     set -e
 
     kill $SERVER_PID
@@ -133,9 +158,15 @@ for TEST_CASE in "test_grpc_async_infer" \
     elif [ "$TEST_CASE" == "test_grpc_async_infer_response_complete_during_cancellation" ]; then
         unset TRITONSERVER_DELAY_GRPC_NOTIFICATION
         unset TRITONSERVER_DELAY_GRPC_ENQUEUE
-    elif [ "$TEST_CASE" == "test_grpc_async_infer_cancellation_during_response_complete" ]; then
+    elif [ "$TEST_CASE" == "test_grpc_async_infer_cancellation_before_finish_0" ]; then
         unset TRITONSERVER_DELAY_GRPC_NOTIFICATION
         unset TRITONSERVER_DELAY_RESPONSE_COMPLETION
+    elif [ "$TEST_CASE" == "test_grpc_async_infer_cancellation_before_finish_1" ]; then
+        unset TRITONSERVER_DELAY_GRPC_PROCESS_ENTRY
+        unset TRITONSERVER_DELAY_RESPONSE_COMPLETION
+    elif [ "$TEST_CASE" == "test_grpc_async_infer_cancellation_before_response_complete_and_process_after_final_response" ]; then
+        unset TRITONSERVER_DELAY_GRPC_NOTIFICATION
+        unset TRITONSERVER_DELAY_RESPONSE_COMPLETE_EXEC
     fi
 done
 
diff --git a/src/grpc/infer_handler.cc b/src/grpc/infer_handler.cc
@@ -1,4 +1,4 @@
-// Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -697,9 +697,23 @@ ModelInferHandler::Process(
         std::chrono::milliseconds(state->delay_process_ms_));
   }
 
+  if (is_notification) {
+    state->context_->SetReceivedNotification(true);
+  }
+
   // Handle notification for cancellation which can be raised
   // asynchronously if detected on the network.
   if (state->IsGrpcContextCancelled()) {
+    if (is_notification) {
+      // Received the cancellation notification
+      LOG_VERBOSE(1) << "Cancellation notification received for " << Name()
+                     << ", rpc_ok=" << rpc_ok << ", context "
+                     << state->context_->unique_id_ << " step "
+                     << state->context_->step_ << ", state "
+                     << state->unique_id_ << " step " << state->step_;
+    }
+
+    bool skip_handle_cancellation = false;
     if (rpc_ok && (state->step_ == Steps::START) &&
         (state->context_->step_ != Steps::CANCELLED)) {
 #ifdef TRITON_ENABLE_TRACING
@@ -715,10 +729,16 @@ ModelInferHandler::Process(
       // thread, and cancellation at step START was not reproducible in a
       // single thread scenario.
       StartNewRequest();
+    } else if (
+        state->step_ == Steps::COMPLETE || state->step_ == Steps::FINISH) {
+      // If the request is completed, simply ignore the cancellation.
+      skip_handle_cancellation = true;
+    }
+
+    if (!skip_handle_cancellation) {
+      bool resume = state->context_->HandleCancellation(state, rpc_ok, Name());
+      return resume;
     }
-    bool resume = state->context_->HandleCancellation(
-        state, rpc_ok, Name(), is_notification);
-    return resume;
   }
 
 
@@ -1023,6 +1043,16 @@ ModelInferHandler::InferResponseComplete(
   // notification.
   std::lock_guard<std::recursive_mutex> lock(state->step_mtx_);
 
+  if (state->delay_response_complete_exec_ms_ != 0) {
+    // Will delay the execution of InferResponseComplete by the specified
+    // time. This can be used to test the flow when a cancellation request
+    // is issued for the request before InferResponseComplete proceeds.
+    LOG_INFO << "Delaying InferResponseComplete execution by "
+             << state->delay_response_complete_exec_ms_ << " ms...";
+    std::this_thread::sleep_for(
+        std::chrono::milliseconds(state->delay_response_complete_exec_ms_));
+  }
+
   // Increment the callback index if received valid 'iresponse'
   if (iresponse != nullptr) {
     state->cb_count_++;
diff --git a/src/grpc/infer_handler.h b/src/grpc/infer_handler.h
@@ -730,7 +730,7 @@ class InferHandlerState {
       ctx_->AsyncNotifyWhenDone(notify_state_.get());
     }
 
-    void SetReceivedNotification(bool value) { received_notification_ = true; }
+    void SetReceivedNotification(bool value) { received_notification_ = value; }
 
     bool ReceivedNotification() { return received_notification_; }
 
@@ -860,7 +860,8 @@ class InferHandlerState {
           std::lock_guard<std::recursive_mutex> lock(state->step_mtx_);
           if (state->step_ != Steps::CANCELLED &&
               state->step_ != Steps::COMPLETE) {
-            LOG_VERBOSE(1) << "Issuing cancellation for " << state->unique_id_;
+            LOG_VERBOSE(1) << "Issuing cancellation for " << state->unique_id_
+                           << " step " << state->step_;
             if (state->inference_request_.get() == nullptr) {
               // The context might be holding some states that have
               // not been issued to Triton core. Need to skip calling
@@ -895,8 +896,7 @@ class InferHandlerState {
     // Returns whether or not to continue cycling through the gRPC
     // completion queue or not.
     bool HandleCancellation(
-        InferHandlerStateType* state, bool rpc_ok, const std::string& name,
-        bool is_notification)
+        InferHandlerStateType* state, bool rpc_ok, const std::string& name)
     {
       // Check to avoid early exit in case of triton_grpc_error
       if (!IsCancelled()) {
@@ -908,12 +908,6 @@ class InferHandlerState {
             << " step " << state->step_;
         return true;
       }
-      if (is_notification) {
-        LOG_VERBOSE(1) << "Cancellation notification received for " << name
-                       << ", rpc_ok=" << rpc_ok << ", context "
-                       << state->context_->unique_id_ << ", "
-                       << state->unique_id_ << " step " << state->step_;
-      }
 
       if (state->step_ != Steps::CANCELLATION_ISSUED) {
         // If the context has not been cancelled then
@@ -934,18 +928,6 @@ class InferHandlerState {
           // next iteration from the completion queue which
           // would release the state.
           return true;
-        } else if (is_notification && state->step_ == Steps::CANCELLED) {
-          // A corner case where InferResponseComplete is called between the
-          // cancellation reception but before the cancellation notification
-          // thread enters Process function.
-          // Should let the InferResponseComplete callback trigger the state
-          // release.
-          LOG_VERBOSE(1) << "Waiting for the state enqueued by callback to "
-                            "complete cancellation for "
-                         << name << ", rpc_ok=" << rpc_ok << ", context "
-                         << state->context_->unique_id_ << ", "
-                         << state->unique_id_ << " step " << state->step_;
-          return true;
         } else {
           // The cancellation request has been handled so the state can be
           // released.
@@ -1140,8 +1122,12 @@ class InferHandlerState {
     delay_response_ms_ = ParseDebugVariable("TRITONSERVER_DELAY_GRPC_RESPONSE");
     delay_complete_ms_ = ParseDebugVariable("TRITONSERVER_DELAY_GRPC_COMPLETE");
     delay_process_ms_ = ParseDebugVariable("TRITONSERVER_DELAY_GRPC_PROCESS");
+    delay_process_entry_ms_ =
+        ParseDebugVariable("TRITONSERVER_DELAY_GRPC_PROCESS_ENTRY");
     delay_notification_process_entry_ms_ =
         ParseDebugVariable("TRITONSERVER_DELAY_GRPC_NOTIFICATION");
+    delay_response_complete_exec_ms_ =
+        ParseDebugVariable("TRITONSERVER_DELAY_RESPONSE_COMPLETE_EXEC");
     delay_enqueue_ms_ = ParseDebugVariable("TRITONSERVER_DELAY_GRPC_ENQUEUE");
     delay_response_completion_ms_ =
         ParseDebugVariable("TRITONSERVER_DELAY_RESPONSE_COMPLETION");
@@ -1269,7 +1255,9 @@ class InferHandlerState {
   int delay_response_ms_;
   int delay_complete_ms_;
   int delay_process_ms_;
+  int delay_process_entry_ms_;
   int delay_notification_process_entry_ms_;
+  int delay_response_complete_exec_ms_;
   int delay_enqueue_ms_;
   int delay_response_completion_ms_;
 
@@ -1503,7 +1491,6 @@ InferHandler<
       if (state->step_ == Steps::WAITING_NOTIFICATION) {
         State* state_wrapper = state;
         state = state_wrapper->state_ptr_;
-        state->context_->SetReceivedNotification(true);
         is_notification = true;
         LOG_VERBOSE(1) << "Received notification for " << Name() << ", "
                        << state->unique_id_;
@@ -1522,14 +1509,25 @@ InferHandler<
           std::this_thread::sleep_for(std::chrono::milliseconds(
               state->delay_notification_process_entry_ms_));
         }
+      } else {
+        if (state->delay_process_entry_ms_ != 0) {
+          // Will delay the entry to Process by the specified time.
+          LOG_INFO << "Delaying the entry to Process thread by "
+                   << state->delay_process_entry_ms_ << " ms...";
+          std::this_thread::sleep_for(
+              std::chrono::milliseconds(state->delay_process_entry_ms_));
+        }
       }
+
       LOG_VERBOSE(2) << "Grpc::CQ::Next() "
                      << state->context_->DebugString(state);
       if (!Process(state, ok, is_notification)) {
         LOG_VERBOSE(1) << "Done for " << Name() << ", " << state->unique_id_;
         state->context_->EraseState(state);
         StateRelease(state);
       } else {
+        // In non-streaming infer mode which has multiple request handlers,
+        // there is no guarantee state->context_ is valid beyond this line.
         LOG_VERBOSE(2) << "Returning from " << Name() << ", "
                        << state->unique_id_ << ", " << state->step_;
       }
diff --git a/src/grpc/stream_infer_handler.cc b/src/grpc/stream_infer_handler.cc
@@ -1,4 +1,4 @@
-// Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -133,6 +133,9 @@ bool
 ModelStreamInferHandler::Process(
     InferHandler::State* state, bool rpc_ok, bool is_notification)
 {
+  if (is_notification) {
+    state->context_->SetReceivedNotification(true);
+  }
   // Because gRPC doesn't allow concurrent writes on the
   // the stream we only have a single handler thread that
   // reads from the completion queue. Hence, cancellation
@@ -144,8 +147,16 @@ ModelStreamInferHandler::Process(
   if (state->context_->ReceivedNotification()) {
     std::lock_guard<std::recursive_mutex> lock(state->step_mtx_);
     if (state->IsGrpcContextCancelled()) {
-      bool resume = state->context_->HandleCancellation(
-          state, rpc_ok, Name(), is_notification);
+      if (is_notification) {
+        // This is the cancellation notification
+        LOG_VERBOSE(1) << "Cancellation notification received for " << Name()
+                       << ", rpc_ok=" << rpc_ok << ", context "
+                       << state->context_->unique_id_ << " step "
+                       << state->context_->step_ << ", state "
+                       << state->unique_id_ << " step " << state->step_;
+      }
+
+      bool resume = state->context_->HandleCancellation(state, rpc_ok, Name());
       return resume;
     } else {
       if (state->context_->HandleCompletion()) {
diff --git a/src/grpc/stream_infer_handler.h b/src/grpc/stream_infer_handler.h
@@ -106,8 +106,7 @@ class ModelStreamInferHandler
 
  protected:
   void StartNewRequest() override;
-  bool Process(
-      State* state, bool rpc_ok, bool is_notification = false) override;
+  bool Process(State* state, bool rpc_ok, bool is_notification) override;
 
  private:
   static void StreamInferResponseComplete(