Skip to content

Commit 25baa7b

Browse files
authored
fix: Fix segfaults in tracing mode after long run (#8144)
1 parent ef52c84 commit 25baa7b

File tree

3 files changed

+210
-10
lines changed

3 files changed

+210
-10
lines changed

qa/L0_trace/test.sh

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,11 @@ mkdir -p $MODELSDIR/custom_identity_int32/1 && (cd $MODELSDIR/custom_identity_in
109109

110110
RET=0
111111

112+
# set up identity_fp32 model
113+
mkdir -p $MODELSDIR/identity_fp32/1 && \
114+
cp ../python_models/identity_fp32/model.py $MODELSDIR/identity_fp32/1/. && \
115+
cp ../python_models/identity_fp32/config.pbtxt $MODELSDIR/identity_fp32/.
116+
112117
# Helpers =======================================
113118
function assert_curl_success {
114119
message="${1}"
@@ -186,6 +191,26 @@ function send_inference_requests {
186191
done
187192
}
188193

194+
function run_stress_client {
195+
stress_client="${1}"
196+
client_log="${2}"
197+
echo "Running stress test for 120 seconds..."
198+
bash -c '
199+
# Handle SIGTERM (signal 15) and exit gracefully
200+
trap "echo \"cleaning up stress client...\"; exit 0" SIGTERM
201+
202+
while true; do
203+
python3 "$1" >> "$2"
204+
sleep 0.1
205+
done' _ "$stress_client" "$client_log" & CLIENT_PID=$!
206+
sleep 120
207+
208+
set -e
209+
kill $CLIENT_PID
210+
wait $CLIENT_PID
211+
set +e
212+
}
213+
189214
#=======================================
190215

191216
# start with trace-level=OFF
@@ -1244,4 +1269,78 @@ set -e
12441269
kill $SERVER_PID
12451270
wait $SERVER_PID
12461271
set +e
1272+
1273+
# Long running stress test
1274+
# Triton trace mode
1275+
SERVER_ARGS="--model-control-mode=explicit \
1276+
--model-repository=$MODELSDIR \
1277+
--load-model=identity_fp32 \
1278+
--trace-config mode=triton \
1279+
--trace-config triton,file=./trace \
1280+
--trace-config rate=1 \
1281+
--trace-config level=TIMESTAMPS"
1282+
SERVER_LOG="./inference_server_triton_trace_stress.log"
1283+
CLIENT_LOG="./client_triton_trace_stress.log"
1284+
STRESS_CLIENT="./trace_stress_grpc_client.py"
1285+
1286+
run_server
1287+
if [ "$SERVER_PID" == "0" ]; then
1288+
echo -e "\n***\n*** Failed to start $SERVER\n***"
1289+
cat $SERVER_LOG
1290+
exit 1
1291+
fi
1292+
1293+
# Run stress test
1294+
run_stress_client $STRESS_CLIENT $CLIENT_LOG
1295+
1296+
set -e
1297+
if ! kill -0 ${SERVER_PID} > /dev/null 2>&1; then
1298+
echo -e "\n***\n*** Server stopped unexpectedly during stress test\n***"
1299+
cat $SERVER_LOG
1300+
RET=1
1301+
else
1302+
kill $SERVER_PID
1303+
wait $SERVER_PID
1304+
fi
1305+
set +e
1306+
1307+
# Opentelemetry trace mode
1308+
SERVER_ARGS="--model-control-mode=explicit \
1309+
--model-repository=$MODELSDIR \
1310+
--load-model=identity_fp32 \
1311+
--trace-config level=TIMESTAMPS \
1312+
--trace-config rate=1 \
1313+
--trace-config mode=opentelemetry \
1314+
--trace-config opentelemetry,resource=test.key=test.value \
1315+
--trace-config opentelemetry,resource=service.name=test_triton \
1316+
--trace-config opentelemetry,url=localhost:$OTLP_PORT/v1/traces"
1317+
SERVER_LOG="./inference_server_otel_trace_stress.log"
1318+
CLIENT_LOG="./client_otel_trace_stress.log"
1319+
STRESS_CLIENT="./trace_stress_grpc_client.py"
1320+
1321+
run_server
1322+
if [ "$SERVER_PID" == "0" ]; then
1323+
echo -e "\n***\n*** Failed to start $SERVER\n***"
1324+
cat $SERVER_LOG
1325+
exit 1
1326+
fi
1327+
1328+
rm collected_traces.json
1329+
$OTEL_COLLECTOR --config ./trace-config.yaml >> $OTEL_COLLECTOR_LOG 2>&1 & COLLECTOR_PID=$!
1330+
# Run stress test
1331+
run_stress_client $STRESS_CLIENT $CLIENT_LOG
1332+
1333+
set -e
1334+
kill $COLLECTOR_PID
1335+
wait $COLLECTOR_PID
1336+
if ! kill -0 ${SERVER_PID} > /dev/null 2>&1; then
1337+
echo -e "\n***\n*** Server stopped unexpectedly during stress test\n***"
1338+
cat $SERVER_LOG
1339+
RET=1
1340+
else
1341+
kill $SERVER_PID
1342+
wait $SERVER_PID
1343+
fi
1344+
set +e
1345+
12471346
exit $RET
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
#!/usr/bin/env python
2+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
3+
#
4+
# Redistribution and use in source and binary forms, with or without
5+
# modification, are permitted provided that the following conditions
6+
# are met:
7+
# * Redistributions of source code must retain the above copyright
8+
# notice, this list of conditions and the following disclaimer.
9+
# * Redistributions in binary form must reproduce the above copyright
10+
# notice, this list of conditions and the following disclaimer in the
11+
# documentation and/or other materials provided with the distribution.
12+
# * Neither the name of NVIDIA CORPORATION nor the names of its
13+
# contributors may be used to endorse or promote products derived
14+
# from this software without specific prior written permission.
15+
#
16+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
17+
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
20+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
21+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23+
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
24+
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26+
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27+
28+
import random
29+
import sys
30+
import time
31+
from functools import partial
32+
33+
import numpy as np
34+
import tritonclient.grpc as grpcclient
35+
36+
if __name__ == "__main__":
37+
# 1 s cancellation timeout (client_timeout is specified in seconds)
38+
client_timeout = 1
39+
url = "localhost:8001"
40+
41+
try:
42+
triton_client = grpcclient.InferenceServerClient(url=url)
43+
except Exception as e:
44+
print("context creation failed: " + str(e))
45+
sys.exit()
46+
47+
model_name = "identity_fp32"
48+
49+
# Infer
50+
inputs = []
51+
52+
input_data = np.array(
53+
[random.random() for i in range(50)], dtype=np.float32
54+
).reshape(1, -1)
55+
model_input = grpcclient.InferInput(
56+
name="INPUT0", datatype="FP32", shape=input_data.shape
57+
)
58+
model_input.set_data_from_numpy(input_data)
59+
inputs.append(model_input)
60+
61+
# Define the callback function. Note the last two parameters should be
62+
# result and error. InferenceServerClient would provide the results of an
63+
# inference as grpcclient.InferResult in result. For successful
64+
# inference, error will be None, otherwise it will be an object of
65+
# tritonclientutils.InferenceServerException holding the error details
66+
def callback(user_data, result, error):
67+
if error:
68+
user_data.append(error)
69+
else:
70+
user_data.append(result)
71+
72+
# list to hold the results of inference.
73+
user_data = []
74+
75+
# Inference call
76+
for _ in range(1000):
77+
triton_client.async_infer(
78+
model_name=model_name,
79+
inputs=inputs,
80+
callback=partial(callback, user_data),
81+
client_timeout=client_timeout,
82+
)
83+
84+
# Wait until the results are available in user_data
85+
time_out = 20
86+
while (len(user_data) == 0) and time_out > 0:
87+
time_out = time_out - 1
88+
time.sleep(1)
89+
90+
print("results: ", len(user_data))

src/tracer.cc

Lines changed: 21 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Copyright 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
1+
// Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
//
33
// Redistribution and use in source and binary forms, with or without
44
// modification, are permitted provided that the following conditions
@@ -740,16 +740,27 @@ TraceManager::TraceRelease(TRITONSERVER_InferenceTrace* trace, void* userp)
740740
LOG_TRITONSERVER_ERROR(
741741
TRITONSERVER_InferenceTraceId(trace, &id), "getting trace id");
742742

743-
auto ts = reinterpret_cast<std::shared_ptr<TraceManager::Trace>*>(userp);
744-
std::lock_guard<std::mutex> lk((*ts)->mtx_);
745-
(*ts)->spawned_traces_tracker_.erase(id);
746-
// The userp will be shared with the trace children, so only delete it
747-
// if no more TraceRelease calls are expected
748-
if ((*ts)->spawned_traces_tracker_.empty()) {
749-
delete ts;
743+
auto ts_ptr = reinterpret_cast<std::shared_ptr<TraceManager::Trace>*>(userp);
744+
std::shared_ptr<TraceManager::Trace> tracer_sp;
745+
bool delete_ts = false;
746+
{
747+
std::lock_guard<std::mutex> lk((*ts_ptr)->mtx_);
748+
(*ts_ptr)->spawned_traces_tracker_.erase(id);
749+
// The userp will be shared with the trace children, so only delete it
750+
// if no more TraceRelease calls are expected
751+
if ((*ts_ptr)->spawned_traces_tracker_.empty()) {
752+
// Move the trace shared_ptr out inside lock to ensure mutex stays alive
753+
// and destruct outside lock
754+
tracer_sp = std::move(*ts_ptr);
755+
delete_ts = true;
756+
delete ts_ptr;
757+
}
758+
LOG_TRITONSERVER_ERROR(
759+
TRITONSERVER_InferenceTraceDelete(trace), "deleting trace");
760+
}
761+
if (delete_ts) {
762+
tracer_sp.reset();
750763
}
751-
LOG_TRITONSERVER_ERROR(
752-
TRITONSERVER_InferenceTraceDelete(trace), "deleting trace");
753764
}
754765

755766
const char*

0 commit comments

Comments
 (0)