Skip to content

Commit 3bca828

Browse files
mc-nv and indrajit96 authored
test: Add test for ORCA (#8009) (#8112)
Co-authored-by: Indrajit Bhosale <iamindrajitb@gmail.com>
1 parent 37de29d commit 3bca828

File tree

2 files changed

+335
-0
lines changed

2 files changed

+335
-0
lines changed

qa/L0_orca/orca_http_test.py

Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
1+
#!/usr/bin/python3
2+
# Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
#
4+
# Redistribution and use in source and binary forms, with or without
5+
# modification, are permitted provided that the following conditions
6+
# are met:
7+
# * Redistributions of source code must retain the above copyright
8+
# notice, this list of conditions and the following disclaimer.
9+
# * Redistributions in binary form must reproduce the above copyright
10+
# notice, this list of conditions and the following disclaimer in the
11+
# documentation and/or other materials provided with the distribution.
12+
# * Neither the name of NVIDIA CORPORATION nor the names of its
13+
# contributors may be used to endorse or promote products derived
14+
# from this software without specific prior written permission.
15+
#
16+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
17+
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
20+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
21+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23+
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
24+
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26+
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27+
28+
import argparse
29+
import json
30+
import sys
31+
32+
import requests
33+
34+
35+
# To run the test, have tritonserver running and run this script with the endpoint as a flag.
36+
#
37+
# Example:
38+
# ```
39+
# python3 orca_http_test.py http://localhost:8000/v2/models/ensemble/generate
40+
# ```
41+
def get_endpoint_header(url, data, request_header=None):
    """
    POST ``data`` as JSON to ``url`` and report the "endpoint-load-metrics" response header.

    ``request_header`` is an optional dict of extra request headers; a falsy
    value sends the request without any custom headers.

    Returns the header value, "" when the response does not carry the header,
    or None when the request fails or the server answers with an error status.
    """
    HEADER_KEY = "endpoint-load-metrics"
    # Only forward custom headers when the caller actually supplied some.
    extra_kwargs = {"headers": request_header} if request_header else {}
    try:
        response = requests.post(url, json=data, **extra_kwargs)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error making request: {e}")
        return None
    return response.headers.get(HEADER_KEY, "")
58+
59+
60+
def parse_header_data(header, orca_format):
    """
    Parse an "endpoint-load-metrics" header value into its named metrics.

    Args:
        header: Raw header value, e.g. 'JSON {"named_metrics": {...}}' or
            'TEXT named_metrics.a=1, named_metrics.b=2'.
        orca_format: "json" or "text"; selects which parser is applied.

    Returns:
        For "json", the value stored under "named_metrics". For "text", a dict
        mapping every "named_metrics.<name>" entry to a float. None when the
        format is unknown, the payload is malformed, or no named metrics are
        present.
    """
    METRIC_KEY = "named_metrics"

    def strip_prefix(value, prefix):
        # Remove only a leading format tag; a blanket str.replace would also
        # rewrite any later occurrence of the tag inside the payload.
        return value[len(prefix):] if value.startswith(prefix) else value

    try:
        if orca_format == "json":
            # Parse the header in JSON format
            data = json.loads(strip_prefix(header, "JSON "))
            # Valid JSON is not necessarily an object ("5", "null", ...);
            # a membership test on such values would raise TypeError.
            if isinstance(data, dict) and METRIC_KEY in data:
                return data[METRIC_KEY]
            print(f"No key '{METRIC_KEY}' in header data: {data}")
            return None
        elif orca_format == "text":
            # Parse the header in TEXT format
            data = {}
            for key_value_pair in strip_prefix(header, "TEXT ").split(", "):
                # A pair without exactly one "=" raises ValueError, handled below.
                key, value = key_value_pair.split("=")
                if "." in key:
                    prefix, nested_key = key.split(".", 1)
                    if prefix == METRIC_KEY:
                        data[nested_key] = float(value)
            if not data:
                print(f"Could not parse any keys from header: {header}")
                return None
            return data
        else:
            print(f"Invalid ORCA format: {orca_format}")
            return None
    except (json.JSONDecodeError, ValueError, KeyError):
        print("Error: Invalid data in the header.")
        return None
93+
94+
95+
def check_for_keys(data, desired_keys, orca_format):
    """
    Report whether every key in ``desired_keys`` is present in ``data``.

    Prints the matched key/value pairs on success, or the missing keys on
    failure, and returns True/False accordingly.
    """
    for wanted in desired_keys:
        if wanted not in data:
            # Bail out on the first absent key and list everything that is missing.
            print(f"Missing keys in header: {', '.join(set(desired_keys) - set(data))}")
            return False

    found = [f'{k}: {data[k]}' for k in desired_keys]
    print(
        f"ORCA header present in {orca_format} format with kv_cache_utilization: {found}"
    )
    return True
107+
108+
109+
def request_header(orca_format):
    """
    Build the request headers that ask the server for ORCA metrics in ``orca_format``.

    Returns None when ``orca_format`` is falsy, i.e. no format is requested.
    """
    if not orca_format:
        return None
    return {"endpoint-load-metrics-format": orca_format}
111+
112+
113+
def test_header_type(url, data, orca_format):
    """
    Send one generate request and validate the ORCA response header for ``orca_format``.

    Args:
        url: Endpoint the POST request is sent to.
        data: JSON-serializable request payload.
        orca_format: "json", "text", or a falsy value to send no
            "endpoint-load-metrics-format" request header.

    Returns:
        True when the response matches expectations: for a requested format the
        header parses and contains every desired metric key; with no format
        requested the response header is absent. False otherwise.
    """
    desired_keys = {
        "kv_cache_utilization",
        "max_token_capacity",
    }

    req_header = request_header(orca_format)
    # Use the caller-supplied url/data rather than script-level globals so the
    # function also works when this module is imported.
    response_header = get_endpoint_header(url, data, req_header)

    if response_header is None:
        print(f"Request to endpoint: '{url}' failed.")
        return False

    if response_header == "":
        if orca_format:
            print(
                f"response header empty, endpoint-load-metrics-format={orca_format} is not a valid ORCA metric format"
            )
            return False
        # No request header set <=> no response header. Intended behavior.
        print("response header empty, endpoint-load-metrics-format is not set")
        return True

    # Kept in a separate name so the request payload argument is not rebound.
    metrics = parse_header_data(response_header, orca_format)
    if metrics:
        return check_for_keys(metrics, desired_keys, orca_format)

    print(f"Unexpected response header value: {response_header}")
    return False
142+
143+
144+
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Make a POST request to generate endpoint to test the ORCA metrics header."
    )
    parser.add_argument("url", help="The model URL to send the request to.")
    # `args` and `TEST_DATA` are kept as module-level names so other code in
    # this module can continue to reference them.
    args = parser.parse_args()
    TEST_DATA = {
        "text_input": "hello world",
        "max_tokens": 20,
        "bad_words": "",
        "stop_words": "",
    }
    passed = True

    # Exercise both supported formats, plus None (no format header is sent).
    # The loop variable is deliberately not called `format`, which would
    # shadow the builtin.
    for orca_format in ("json", "text", None):
        print("Checking response header for ORCA format:", orca_format)
        if not test_header_type(args.url, TEST_DATA, orca_format):
            print("FAIL on format:", orca_format)
            passed = False

    # Exit status 0 only when every format check passed.
    sys.exit(0 if passed else 1)

qa/L0_orca/test.sh

Lines changed: 174 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,174 @@
1+
#!/bin/bash
2+
# Copyright 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
#
4+
# Redistribution and use in source and binary forms, with or without
5+
# modification, are permitted provided that the following conditions
6+
# are met:
7+
# * Redistributions of source code must retain the above copyright
8+
# notice, this list of conditions and the following disclaimer.
9+
# * Redistributions in binary form must reproduce the above copyright
10+
# notice, this list of conditions and the following disclaimer in the
11+
# documentation and/or other materials provided with the distribution.
12+
# * Neither the name of NVIDIA CORPORATION nor the names of its
13+
# contributors may be used to endorse or promote products derived
14+
# from this software without specific prior written permission.
15+
#
16+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
17+
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
20+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
21+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23+
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
24+
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26+
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27+
28+
# Overall test status; set to 1 when the client test fails.
RET=0
BASE_DIR=$(pwd)

# Values that may be preset in the environment; the defaults apply when unset or empty.
NUM_GPUS="${NUM_GPUS:-1}"
TENSORRTLLM_BACKEND_REPO_TAG="${TENSORRTLLM_BACKEND_REPO_TAG:-main}"
TRITON_REPO_ORG="${TRITON_REPO_ORG:-https://github.com/triton-inference-server}"
TRITON_DIR="${TRITON_DIR:-/opt/tritonserver}"
SERVER_TIMEOUT="${SERVER_TIMEOUT:-120}"

# NOTE(review): TRT_ROOT, TRITON_REPO_ORG, TENSORRTLLM_BACKEND_REPO_TAG and
# BACKEND_DIR are not referenced in this script; presumably they are consumed
# by helpers from ../common/util.sh -- confirm.
TRT_ROOT="/usr/local/tensorrt"

# Model and repository layout.
MODEL_NAME="gpt2_tensorrt_llm"
NAME="tensorrt_llm_benchmarking_test"
MODEL_REPOSITORY="${BASE_DIR}/triton_model_repo"
TENSORRTLLM_BACKEND_DIR="/workspace/tensorrtllm_backend"
GPT_DIR="$TENSORRTLLM_BACKEND_DIR/tensorrt_llm/examples/gpt"
TOKENIZER_DIR="$GPT_DIR/gpt2"
ENGINES_DIR="${BASE_DIR}/engines/inflight_batcher_llm/${NUM_GPUS}-gpu"

# Server binary, client script and log files.
SERVER="${TRITON_DIR}/bin/tritonserver"
BACKEND_DIR="${TRITON_DIR}/backends"
SERVER_LOG="${NAME}_server.log"
CLIENT_PY="${BASE_DIR}/orca_http_test.py"
CLIENT_LOG="${NAME}_orca_http_test.log"

source ../common/util.sh
50+
51+
# Recreate ${MODEL_REPOSITORY} from the backend's inflight_batcher_llm model
# templates, rename the ensemble model to ${MODEL_NAME}, and substitute the
# placeholders in each model's config.pbtxt.
# NOTE(review): replace_config_tags is not defined in this script; presumably
# it comes from ../common/util.sh -- confirm.
function prepare_model_repository {
    local ensemble_config="${MODEL_REPOSITORY}/${MODEL_NAME}/config.pbtxt"
    local preprocessing_config="${MODEL_REPOSITORY}/preprocessing/config.pbtxt"
    local postprocessing_config="${MODEL_REPOSITORY}/postprocessing/config.pbtxt"
    local tensorrt_llm_config="${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt"

    # All expansions are quoted so a path containing whitespace or glob
    # characters cannot be split or expanded (most importantly for `rm -rf`).
    rm -rf "${MODEL_REPOSITORY}" && mkdir "${MODEL_REPOSITORY}"
    # The trailing /* must stay outside the quotes so the glob still expands.
    cp -r "${TENSORRTLLM_BACKEND_DIR}"/all_models/inflight_batcher_llm/* "${MODEL_REPOSITORY}"
    rm -rf "${MODEL_REPOSITORY}/tensorrt_llm_bls"
    mv "${MODEL_REPOSITORY}/ensemble" "${MODEL_REPOSITORY}/${MODEL_NAME}"

    # Ensemble model (renamed to ${MODEL_NAME}).
    replace_config_tags "model_version: -1" "model_version: 1" "${ensemble_config}"
    replace_config_tags '${triton_max_batch_size}' "128" "${ensemble_config}"
    replace_config_tags 'name: "ensemble"' "name: \"$MODEL_NAME\"" "${ensemble_config}"
    replace_config_tags '${logits_datatype}' "TYPE_FP32" "${ensemble_config}"

    # Preprocessing model.
    replace_config_tags '${triton_max_batch_size}' "128" "${preprocessing_config}"
    replace_config_tags '${preprocessing_instance_count}' '1' "${preprocessing_config}"
    replace_config_tags '${tokenizer_dir}' "${TOKENIZER_DIR}/" "${preprocessing_config}"
    replace_config_tags '${logits_datatype}' "TYPE_FP32" "${preprocessing_config}"
    replace_config_tags '${max_queue_delay_microseconds}' "1000000" "${preprocessing_config}"
    replace_config_tags '${max_queue_size}' "0" "${preprocessing_config}"

    # Postprocessing model.
    replace_config_tags '${triton_max_batch_size}' "128" "${postprocessing_config}"
    replace_config_tags '${postprocessing_instance_count}' '1' "${postprocessing_config}"
    replace_config_tags '${tokenizer_dir}' "${TOKENIZER_DIR}/" "${postprocessing_config}"
    replace_config_tags '${logits_datatype}' "TYPE_FP32" "${postprocessing_config}"

    # TensorRT-LLM model.
    replace_config_tags '${triton_max_batch_size}' "128" "${tensorrt_llm_config}"
    replace_config_tags '${decoupled_mode}' 'true' "${tensorrt_llm_config}"
    replace_config_tags '${max_queue_delay_microseconds}' "1000000" "${tensorrt_llm_config}"
    replace_config_tags '${batching_strategy}' 'inflight_fused_batching' "${tensorrt_llm_config}"
    replace_config_tags '${engine_dir}' "${ENGINES_DIR}" "${tensorrt_llm_config}"
    replace_config_tags '${triton_backend}' "tensorrtllm" "${tensorrt_llm_config}"
    replace_config_tags '${max_queue_size}' "0" "${tensorrt_llm_config}"
    replace_config_tags '${logits_datatype}' "TYPE_FP32" "${tensorrt_llm_config}"
    replace_config_tags '${encoder_input_features_data_type}' "TYPE_FP32" "${tensorrt_llm_config}"
}
84+
85+
# Poll the server once per second until it reports ready.
#   $1     maximum number of polling attempts (default 30)
#   $2...  server PIDs that must stay alive while waiting
# Sets WAIT_RET to 0 on success, or to 1 when a PID has exited or the
# attempts run out.
function wait_for_server_ready() {
    local timeout_secs="${1:-30}"
    shift
    local server_pids=("$@")

    WAIT_RET=0

    for _ in $(seq "$timeout_secs"); do
        # Give up immediately if any server process is no longer alive.
        for pid in "${server_pids[@]}"; do
            kill -0 "$pid" >/dev/null 2>&1 && continue
            echo "=== Server not running."
            WAIT_RET=1
            return
        done

        sleep 1

        # Ready means: the health endpoint answers AND the logging endpoint
        # accepts the request that sets log_verbose_level to 1.
        curl -s --fail localhost:8000/v2/health/ready || continue
        curl -s --fail -w "%{http_code}" -o /dev/null -d '{"log_verbose_level":1}' localhost:8000/v2/logging || continue
        return
    done

    echo "=== Timeout $timeout_secs secs. Server not ready."
    WAIT_RET=1
}
114+
115+
# Start Triton in the background via the backend's launch_triton_server.py,
# record the tritonserver PIDs in the SERVER_PID array, and wait for the
# server to become ready. Exits the script with status 1 (after dumping the
# server log) when the server does not come up.
function run_server {
    python3 ${TENSORRTLLM_BACKEND_DIR}/scripts/launch_triton_server.py \
        --world_size="${NUM_GPUS}" \
        --model_repo="${MODEL_REPOSITORY}" \
        >${SERVER_LOG} 2>&1 &

    # Allow time for the tritonserver process(es) to appear before looking them up.
    sleep 2
    # One PID per array element.
    mapfile -t SERVER_PID < <(pgrep tritonserver)

    wait_for_server_ready ${SERVER_TIMEOUT} "${SERVER_PID[@]}"
    [[ "$WAIT_RET" == "0" ]] && return

    # Startup failed: clean up whatever was launched and abort the test.
    kill "${SERVER_PID[@]}" >/dev/null 2>&1 || true
    echo -e "\n***\n*** Failed to start $SERVER\n***"
    cat $SERVER_LOG
    exit 1
}
130+
131+
# Send SIGINT to every running tritonserver process, then block until each
# PID recorded in SERVER_PID has exited.
function kill_server {
    local running_pids
    readarray -t running_pids < <(pgrep tritonserver)

    # Only signal when something is running: piping an empty PID list into
    # `xargs kill` still invokes `kill -SIGINT` with no operand under GNU
    # xargs, which merely prints a usage error.
    if [ "${#running_pids[@]}" -gt 0 ]; then
        kill -SIGINT "${running_pids[@]}"
    fi

    for pid in "${SERVER_PID[@]}"; do
        echo "Waiting for proc ${pid} to terminate..."
        # kill -0 only probes for existence; loop until the process is gone.
        while kill -0 "$pid" >/dev/null 2>&1; do
            sleep 1
        done
    done
}
140+
141+
# Setup steps. The first three helpers are not defined in this script;
# presumably they come from ../common/util.sh -- confirm.
clone_tensorrt_llm_backend_repo
build_gpt2_base_model
build_gpt2_tensorrt_engine
prepare_model_repository

# From here on failures are handled explicitly instead of aborting the script.
set +e
run_server

# NOTE(review): SERVER_PID is filled from pgrep output by run_server, and
# run_server already exits on startup failure, so this comparison against "0"
# looks unreachable -- confirm before relying on it.
if [[ "${SERVER_PID[0]}" == "0" ]]; then
    echo -e "\n***\n*** Failed to start $SERVER\n***"
    cat $SERVER_LOG
    exit 1
fi

RET=0

# Run the HTTP client against the generate endpoint; all output goes to the client log.
if ! python3 $CLIENT_PY "http://localhost:8000/v2/models/${MODEL_NAME}/generate" >>$CLIENT_LOG 2>&1; then
    echo "Failed: Client test had a non-zero return code."
    RET=1
fi

if [ $RET -ne 0 ]; then
    # Dump both logs to aid debugging of the failure.
    cat $SERVER_LOG
    cat $CLIENT_LOG
    echo -e "\n***\n*** ORCA Test FAILED\n***"
else
    echo -e "\n***\n*** ORCA Test Passed\n***"
fi

kill_server
set -e
exit $RET

0 commit comments

Comments
 (0)