Skip to content

Commit 25baa7b

Browse files
authored
fix: Fix segfaults in tracing mode after long run (#8144)
1 parent ef52c84 commit 25baa7b

File tree

3 files changed

+210
-10
lines changed

3 files changed

+210
-10
lines changed

qa/L0_trace/test.sh

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,11 @@ mkdir -p $MODELSDIR/custom_identity_int32/1 && (cd $MODELSDIR/custom_identity_in
109109

110110
RET=0
111111

112+
# set up identity_fp32 model
113+
mkdir -p $MODELSDIR/identity_fp32/1 && \
114+
cp ../python_models/identity_fp32/model.py $MODELSDIR/identity_fp32/1/. && \
115+
cp ../python_models/identity_fp32/config.pbtxt $MODELSDIR/identity_fp32/.
116+
112117
# Helpers =======================================
113118
function assert_curl_success {
114119
message="${1}"
@@ -186,6 +191,26 @@ function send_inference_requests {
186191
done
187192
}
188193

194+
function run_stress_client {
195+
stress_client="${1}"
196+
client_log="${2}"
197+
echo "Running stress test for 120 seconds..."
198+
bash -c '
199+
# Handle SIGTERM (signal 15) and exit gracefully
200+
trap "echo \"cleaning up stress client...\"; exit 0" SIGTERM
201+
202+
while true; do
203+
python3 "$1" >> "$2"
204+
sleep 0.1
205+
done' _ "$stress_client" "$client_log" & CLIENT_PID=$!
206+
sleep 120
207+
208+
set -e
209+
kill $CLIENT_PID
210+
wait $CLIENT_PID
211+
set +e
212+
}
213+
189214
#=======================================
190215

191216
# start with trace-level=OFF
@@ -1244,4 +1269,78 @@ set -e
12441269
kill $SERVER_PID
12451270
wait $SERVER_PID
12461271
set +e
1272+
1273+
# Long running stress test
1274+
# Triton trace mode
1275+
SERVER_ARGS="--model-control-mode=explicit \
1276+
--model-repository=$MODELSDIR \
1277+
--load-model=identity_fp32 \
1278+
--trace-config mode=triton \
1279+
--trace-config triton,file=./trace \
1280+
--trace-config rate=1 \
1281+
--trace-config level=TIMESTAMPS"
1282+
SERVER_LOG="./inference_server_triton_trace_stress.log"
1283+
CLIENT_LOG="./client_triton_trace_stress.log"
1284+
STRESS_CLIENT="./trace_stress_grpc_client.py"
1285+
1286+
run_server
1287+
if [ "$SERVER_PID" == "0" ]; then
1288+
echo -e "\n***\n*** Failed to start $SERVER\n***"
1289+
cat $SERVER_LOG
1290+
exit 1
1291+
fi
1292+
1293+
# Run stress test
1294+
run_stress_client $STRESS_CLIENT $CLIENT_LOG
1295+
1296+
set -e
1297+
if ! kill -0 ${SERVER_PID} > /dev/null 2>&1; then
1298+
echo -e "\n***\n*** Server stopped unexpectedly during stress test\n***"
1299+
cat $SERVER_LOG
1300+
RET=1
1301+
else
1302+
kill $SERVER_PID
1303+
wait $SERVER_PID
1304+
fi
1305+
set +e
1306+
1307+
# Opentelemetry trace mode
1308+
SERVER_ARGS="--model-control-mode=explicit \
1309+
--model-repository=$MODELSDIR \
1310+
--load-model=identity_fp32 \
1311+
--trace-config level=TIMESTAMPS \
1312+
--trace-config rate=1 \
1313+
--trace-config mode=opentelemetry \
1314+
--trace-config opentelemetry,resource=test.key=test.value \
1315+
--trace-config opentelemetry,resource=service.name=test_triton \
1316+
--trace-config opentelemetry,url=localhost:$OTLP_PORT/v1/traces"
1317+
SERVER_LOG="./inference_server_otel_trace_stress.log"
1318+
CLIENT_LOG="./client_otel_trace_stress.log"
1319+
STRESS_CLIENT="./trace_stress_grpc_client.py"
1320+
1321+
run_server
1322+
if [ "$SERVER_PID" == "0" ]; then
1323+
echo -e "\n***\n*** Failed to start $SERVER\n***"
1324+
cat $SERVER_LOG
1325+
exit 1
1326+
fi
1327+
1328+
rm collected_traces.json
1329+
$OTEL_COLLECTOR --config ./trace-config.yaml >> $OTEL_COLLECTOR_LOG 2>&1 & COLLECTOR_PID=$!
1330+
# Run stress test
1331+
run_stress_client $STRESS_CLIENT $CLIENT_LOG
1332+
1333+
set -e
1334+
kill $COLLECTOR_PID
1335+
wait $COLLECTOR_PID
1336+
if ! kill -0 ${SERVER_PID} > /dev/null 2>&1; then
1337+
echo -e "\n***\n*** Server stopped unexpectedly during stress test\n***"
1338+
cat $SERVER_LOG
1339+
RET=1
1340+
else
1341+
kill $SERVER_PID
1342+
wait $SERVER_PID
1343+
fi
1344+
set +e
1345+
12471346
exit $RET
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
#!/usr/bin/env python
2+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
3+
#
4+
# Redistribution and use in source and binary forms, with or without
5+
# modification, are permitted provided that the following conditions
6+
# are met:
7+
# * Redistributions of source code must retain the above copyright
8+
# notice, this list of conditions and the following disclaimer.
9+
# * Redistributions in binary form must reproduce the above copyright
10+
# notice, this list of conditions and the following disclaimer in the
11+
# documentation and/or other materials provided with the distribution.
12+
# * Neither the name of NVIDIA CORPORATION nor the names of its
13+
# contributors may be used to endorse or promote products derived
14+
# from this software without specific prior written permission.
15+
#
16+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
17+
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
20+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
21+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23+
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
24+
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26+
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27+
28+
import random
29+
import sys
30+
import time
31+
from functools import partial
32+
33+
import numpy as np
34+
import tritonclient.grpc as grpcclient
35+
36+
if __name__ == "__main__":
37+
# 1 s cancellation timeout (client_timeout is specified in seconds)
38+
client_timeout = 1
39+
url = "localhost:8001"
40+
41+
try:
42+
triton_client = grpcclient.InferenceServerClient(url=url)
43+
except Exception as e:
44+
print("context creation failed: " + str(e))
45+
sys.exit()
46+
47+
model_name = "identity_fp32"
48+
49+
# Infer
50+
inputs = []
51+
52+
input_data = np.array(
53+
[random.random() for i in range(50)], dtype=np.float32
54+
).reshape(1, -1)
55+
model_input = grpcclient.InferInput(
56+
name="INPUT0", datatype="FP32", shape=input_data.shape
57+
)
58+
model_input.set_data_from_numpy(input_data)
59+
inputs.append(model_input)
60+
61+
# Define the callback function. Note the last two parameters should be
62+
# result and error. InferenceServerClient would provide the results of an
63+
# inference as grpcclient.InferResult in result. For successful
64+
# inference, error will be None, otherwise it will be an object of
65+
# tritonclientutils.InferenceServerException holding the error details
66+
def callback(user_data, result, error):
67+
if error:
68+
user_data.append(error)
69+
else:
70+
user_data.append(result)
71+
72+
# list to hold the results of inference.
73+
user_data = []
74+
75+
# Inference call
76+
for _ in range(1000):
77+
triton_client.async_infer(
78+
model_name=model_name,
79+
inputs=inputs,
80+
callback=partial(callback, user_data),
81+
client_timeout=client_timeout,
82+
)
83+
84+
# Wait until the results are available in user_data
85+
time_out = 20
86+
while (len(user_data) == 0) and time_out > 0:
87+
time_out = time_out - 1
88+
time.sleep(1)
89+
90+
print("results: ", len(user_data))

src/tracer.cc

Lines changed: 21 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Copyright 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
1+
// Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
//
33
// Redistribution and use in source and binary forms, with or without
44
// modification, are permitted provided that the following conditions
@@ -740,16 +740,27 @@ TraceManager::TraceRelease(TRITONSERVER_InferenceTrace* trace, void* userp)
740740
LOG_TRITONSERVER_ERROR(
741741
TRITONSERVER_InferenceTraceId(trace, &id), "getting trace id");
742742

743-
auto ts = reinterpret_cast<std::shared_ptr<TraceManager::Trace>*>(userp);
744-
std::lock_guard<std::mutex> lk((*ts)->mtx_);
745-
(*ts)->spawned_traces_tracker_.erase(id);
746-
// The userp will be shared with the trace children, so only delete it
747-
// if no more TraceRelease calls are expected
748-
if ((*ts)->spawned_traces_tracker_.empty()) {
749-
delete ts;
743+
auto ts_ptr = reinterpret_cast<std::shared_ptr<TraceManager::Trace>*>(userp);
744+
std::shared_ptr<TraceManager::Trace> tracer_sp;
745+
bool delete_ts = false;
746+
{
747+
std::lock_guard<std::mutex> lk((*ts_ptr)->mtx_);
748+
(*ts_ptr)->spawned_traces_tracker_.erase(id);
749+
// The userp will be shared with the trace children, so only delete it
750+
// if no more TraceRelease calls are expected
751+
if ((*ts_ptr)->spawned_traces_tracker_.empty()) {
752+
// Move the trace shared_ptr out inside lock to ensure mutex stays alive
753+
// and destruct outside lock
754+
tracer_sp = std::move(*ts_ptr);
755+
delete_ts = true;
756+
delete ts_ptr;
757+
}
758+
LOG_TRITONSERVER_ERROR(
759+
TRITONSERVER_InferenceTraceDelete(trace), "deleting trace");
760+
}
761+
if (delete_ts) {
762+
tracer_sp.reset();
750763
}
751-
LOG_TRITONSERVER_ERROR(
752-
TRITONSERVER_InferenceTraceDelete(trace), "deleting trace");
753764
}
754765

755766
const char*

0 commit comments

Comments
 (0)