
Commit 6db7dc2

[Benchmark] Refactor perf script to use benchmark cli (#1524)
### What this PR does / why we need it?
Since the `vllm bench` CLI is now mature enough for production use (it supports more datasets), we no longer need to copy vLLM's benchmark code. With vLLM installed, the benchmark CLI can be used directly.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
1 parent 53ec583 commit 6db7dc2

File tree

- .github/workflows/nightly_benchmarks.yaml
- benchmarks/README.md
- benchmarks/scripts/patch_benchmark_dataset.py
- benchmarks/scripts/run-performance-benchmarks.sh
- benchmarks/tests/serving-tests.json

5 files changed, +158 −40 lines changed

.github/workflows/nightly_benchmarks.yaml

Lines changed: 1 addition & 0 deletions
@@ -115,6 +115,7 @@ jobs:
       env:
         PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
       run: |
+        pip install "transformers<=4.52.4"
         pip install -e .
         pip install -r benchmarks/requirements-bench.txt

benchmarks/README.md

Lines changed: 120 additions & 11 deletions
@@ -1,27 +1,27 @@
 # Introduction
-This document outlines the benchmarking methodology for vllm-ascend, aimed at evaluating the performance under a variety of workloads. The primary goal is to help developers assess whether their pull requests improve or degrade vllm-ascend's performance. To maintain alignment with vLLM, we use the [benchmark](https://github.com/vllm-project/vllm/tree/main/benchmarks) script provided by the vllm project.
+This document outlines the benchmarking methodology for vllm-ascend, aimed at evaluating its performance under a variety of workloads. The primary goal is to help developers assess whether their pull requests improve or degrade vllm-ascend's performance.
 
 # Overview
 **Benchmarking Coverage**: We measure latency, throughput, and fixed-QPS serving on the Atlas800I A2 (see [quick_start](../docs/source/quick_start.md) for the list of supported devices), with different models (coming soon).
 - Latency tests
   - Input length: 32 tokens.
   - Output length: 128 tokens.
   - Batch size: fixed (8).
-  - Models: Meta-Llama-3.1-8B-Instruct, Qwen2.5-7B-Instruct.
+  - Models: Qwen2.5-7B-Instruct, Qwen3-8B.
   - Evaluation metrics: end-to-end latency (mean, median, p99).
 
 - Throughput tests
   - Input length: 200 prompts randomly sampled from the ShareGPT dataset (with a fixed random seed).
   - Output length: the corresponding output length of these 200 prompts.
   - Batch size: dynamically determined by vllm to achieve maximum throughput.
-  - Models: Meta-Llama-3.1-8B-Instruct, Qwen2.5-7B-Instruct.
+  - Models: Qwen2.5-VL-7B-Instruct, Qwen2.5-7B-Instruct, Qwen3-8B.
   - Evaluation metrics: throughput.
 - Serving tests
   - Input length: 200 prompts randomly sampled from the ShareGPT dataset (with a fixed random seed).
   - Output length: the corresponding output length of these 200 prompts.
   - Batch size: dynamically determined by vllm and the arrival pattern of the requests.
   - **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with a fixed random seed).
-  - Models: Meta-Llama-3.1-8B-Instruct, Qwen2.5-7B-Instruct.
+  - Models: Qwen2.5-VL-7B-Instruct, Qwen2.5-7B-Instruct, Qwen3-8B.
   - Evaluation metrics: throughput, TTFT (time to first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
 
 **Benchmarking Duration**: about 800 seconds per model.
@@ -38,20 +38,129 @@ Before running the benchmarks, ensure the following:
 pip install -r benchmarks/requirements-bench.txt
 ```
 
-- For performance benchmark, it is recommended to set the [load-format](https://github.com/vllm-project/vllm-ascend/blob/5897dc5bbe321ca90c26225d0d70bff24061d04b/benchmarks/tests/latency-tests.json#L7) as `dummy`, It will construct random weights based on the passed model without downloading the weights from internet, which can greatly reduce the benchmark time. feel free to add your own models and parameters in the JSON to run your customized benchmarks.
+- For performance benchmarks, it is recommended to set the [load-format](https://github.com/vllm-project/vllm-ascend/blob/5897dc5bbe321ca90c26225d0d70bff24061d04b/benchmarks/tests/latency-tests.json#L7) to `dummy`. This constructs random weights for the given model without downloading them from the internet, which greatly reduces benchmark time.
+- If you want to run customized benchmarks, feel free to add your own models and parameters to the [JSON](https://github.com/vllm-project/vllm-ascend/tree/main/benchmarks/tests). Let's take `Qwen2.5-VL-7B-Instruct` as an example:
+
+```json
+[
+    {
+        "test_name": "serving_qwen2_5vl_7B_tp1",
+        "qps_list": [
+            1,
+            4,
+            16,
+            "inf"
+        ],
+        "server_parameters": {
+            "model": "Qwen/Qwen2.5-VL-7B-Instruct",
+            "tensor_parallel_size": 1,
+            "swap_space": 16,
+            "disable_log_stats": "",
+            "disable_log_requests": "",
+            "trust_remote_code": "",
+            "max_model_len": 16384
+        },
+        "client_parameters": {
+            "model": "Qwen/Qwen2.5-VL-7B-Instruct",
+            "backend": "openai-chat",
+            "dataset_name": "hf",
+            "hf_split": "train",
+            "endpoint": "/v1/chat/completions",
+            "dataset_path": "lmarena-ai/vision-arena-bench-v0.1",
+            "num_prompts": 200
+        }
+    }
+]
+```
+This JSON is parsed by the benchmark script into server parameters and client parameters. The configuration defines a test case named `serving_qwen2_5vl_7B_tp1`, designed to evaluate the performance of the `Qwen/Qwen2.5-VL-7B-Instruct` model under different request rates. The test includes both server and client parameters; for more parameter details, see the vllm benchmark [cli](https://github.com/vllm-project/vllm/tree/main/vllm/benchmarks).
+
+- **Test Overview**
+  - Test Name: serving_qwen2_5vl_7B_tp1
+  - Queries Per Second (QPS): the test is run at four QPS levels: 1, 4, 16, and inf (infinite load, typically used for stress testing).
+
+- **Server Parameters**
+  - Model: Qwen/Qwen2.5-VL-7B-Instruct
+  - Tensor Parallelism: 1 (no model parallelism; the model runs on a single device or node)
+  - Swap Space: 16 GB (used to handle memory overflow by swapping to disk)
+  - disable_log_stats: disables logging of performance statistics
+  - disable_log_requests: disables logging of individual requests
+  - Trust Remote Code: enabled (allows execution of model-specific custom code)
+  - Max Model Length: 16,384 tokens (maximum context length supported by the model)
+
+- **Client Parameters**
+  - Model: Qwen/Qwen2.5-VL-7B-Instruct (same as the server)
+  - Backend: openai-chat (the client uses the OpenAI-compatible chat API format)
+  - Dataset Source: Hugging Face (hf)
+  - Dataset Split: train
+  - Endpoint: /v1/chat/completions (the REST API endpoint to which chat requests are sent)
+  - Dataset Path: lmarena-ai/vision-arena-bench-v0.1 (the benchmark dataset used for evaluation, hosted on Hugging Face)
+  - Number of Prompts: 200 (the total number of prompts used during the test)
+
 
 ## Run benchmarks
+
+### Use benchmark script
 The provided scripts automatically execute performance tests for serving, throughput, and latency. To start the benchmarking process, run the following command in the vllm-ascend root directory:
 ```
 bash benchmarks/scripts/run-performance-benchmarks.sh
 ```
 Once the script completes, you can find the results in the benchmarks/results folder. The output files may resemble the following:
 ```
-|-- latency_llama8B_tp1.json
-|-- serving_llama8B_tp1_sharegpt_qps_1.json
-|-- serving_llama8B_tp1_sharegpt_qps_16.json
-|-- serving_llama8B_tp1_sharegpt_qps_4.json
-|-- serving_llama8B_tp1_sharegpt_qps_inf.json
-|-- throughput_llama8B_tp1.json
+.
+|-- serving_qwen2_5_7B_tp1_qps_1.json
+|-- serving_qwen2_5_7B_tp1_qps_16.json
+|-- serving_qwen2_5_7B_tp1_qps_4.json
+|-- serving_qwen2_5_7B_tp1_qps_inf.json
+|-- latency_qwen2_5_7B_tp1.json
+|-- throughput_qwen2_5_7B_tp1.json
 ```
 These files contain detailed benchmarking results for further analysis.
+
+### Use benchmark cli
+
+For more flexible and customized use, a benchmark CLI is also provided for running online/offline benchmarks.
+Similarly, let's take the `Qwen2.5-VL-7B-Instruct` benchmark as an example:
+
+#### Online serving
+1. Launch the server:
+```shell
+vllm serve Qwen2.5-VL-7B-Instruct --max-model-len 16789
+```
+2. Run performance tests using the CLI:
+```shell
+vllm bench serve --model Qwen2.5-VL-7B-Instruct \
+    --endpoint-type "openai-chat" --dataset-name hf \
+    --hf-split train --endpoint "/v1/chat/completions" \
+    --dataset-path "lmarena-ai/vision-arena-bench-v0.1" \
+    --num-prompts 200 \
+    --request-rate 16
+```
+
+#### Offline
+- **Throughput**
+```shell
+vllm bench throughput --output-json results/throughput_qwen2_5_7B_tp1.json \
+    --model Qwen/Qwen2.5-7B-Instruct --tensor-parallel-size 1 --load-format dummy \
+    --dataset-path /github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json \
+    --num-prompts 200 --backend vllm
+```
+- **Latency**
+```shell
+vllm bench latency --output-json results/latency_qwen2_5_7B_tp1.json \
+    --model Qwen/Qwen2.5-7B-Instruct --tensor-parallel-size 1 \
+    --load-format dummy --num-iters-warmup 5 --num-iters 15
+```

benchmarks/scripts/patch_benchmark_dataset.py

Lines changed: 15 additions & 6 deletions
@@ -1,3 +1,4 @@
+import os
 from argparse import ArgumentParser
 
 import libcst as cst
@@ -44,25 +45,33 @@ def leave_Call(self, original_node, updated_node):
 
 
 def patch_file(path):
-    with open(path, "r", encoding="utf-8") as f:
+    abs_path = os.path.abspath(path)
+
+    if not os.path.exists(abs_path):
+        print(f"File not found: {abs_path}")
+        return
+
+    with open(abs_path, "r", encoding="utf-8") as f:
         source = f.read()
 
     module = cst.parse_module(source)
     modified = module.visit(StreamingFalseTransformer())
 
-    with open(path, "w", encoding="utf-8") as f:
+    with open(abs_path, "w", encoding="utf-8") as f:
         f.write(modified.code)
 
-    print(f"Patched: {path}")
+    print(f"Patched: {abs_path}")
 
 
 if __name__ == '__main__':
     parser = ArgumentParser(
         description=
         "Patch benchmark_dataset.py to set streaming=False in load_dataset calls"
     )
-    parser.add_argument("--path",
-                        type=str,
-                        help="Path to the benchmark_dataset.py file")
+    parser.add_argument(
+        "--path",
+        type=str,
+        default="/vllm-workspace/vllm/vllm/benchmarks/datasets.py",
+        help="Path to the benchmark_dataset.py file")
     args = parser.parse_args()
     patch_file(args.path)
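The hunks above only touch `patch_file` and the argument parser; the `StreamingFalseTransformer` passed to `module.visit(...)` is defined elsewhere in the file and is not part of this diff. For orientation, a minimal libcst transformer with the same intent — forcing `streaming=False` in `load_dataset(...)` calls — might look roughly like the sketch below; it is an illustration, not the repository's actual implementation:

```python
import libcst as cst


class StreamingFalseTransformer(cst.CSTTransformer):
    """Sketch: rewrite streaming=True to streaming=False in load_dataset(...) calls."""

    def leave_Call(self, original_node, updated_node):
        func = updated_node.func
        # Match both `load_dataset(...)` and `datasets.load_dataset(...)`.
        is_load_dataset = (
            isinstance(func, cst.Name) and func.value == "load_dataset"
        ) or (
            isinstance(func, cst.Attribute) and func.attr.value == "load_dataset"
        )
        if not is_load_dataset:
            return updated_node

        new_args = []
        for arg in updated_node.args:
            # Replace the value of any `streaming=...` keyword argument with False.
            if arg.keyword is not None and arg.keyword.value == "streaming":
                arg = arg.with_changes(value=cst.Name("False"))
            new_args.append(arg)
        return updated_node.with_changes(args=new_args)
```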

benchmarks/scripts/run-performance-benchmarks.sh

Lines changed: 19 additions & 20 deletions
@@ -54,13 +54,20 @@ json2args() {
 }
 
 wait_for_server() {
-    # wait for vllm server to start
-    # return 1 if vllm server crashes
-    timeout 1200 bash -c '
-        until curl -s -X GET localhost:8000/health; do
-            echo "Waiting for vllm server to start..."
-            sleep 1
-        done' && return 0 || return 1
+    local waited=0
+    local timeout_sec=1200
+
+    while (( waited < timeout_sec )); do
+        if curl -s -X GET localhost:8000/health > /dev/null; then
+            return 0
+        fi
+        echo "Waiting for vllm server to start..."
+        sleep 1
+        ((waited++))
+    done
+
+    echo "Timeout waiting for server"
+    return 1
 }
 
 get_cur_npu_id() {
@@ -114,7 +121,7 @@ run_latency_tests() {
         latency_params=$(echo "$params" | jq -r '.parameters')
         latency_args=$(json2args "$latency_params")
 
-        latency_command="python3 vllm_benchmarks/benchmark_latency.py \
+        latency_command="vllm bench latency \
             --output-json $RESULTS_FOLDER/${test_name}.json \
             $latency_args"
 
@@ -157,7 +164,7 @@ run_throughput_tests() {
         throughput_params=$(echo "$params" | jq -r '.parameters')
         throughput_args=$(json2args "$throughput_params")
 
-        throughput_command="python3 vllm_benchmarks/benchmark_throughput.py \
+        throughput_command="vllm bench throughput \
            --output-json $RESULTS_FOLDER/${test_name}.json \
            $throughput_args"
 
@@ -243,7 +250,7 @@ run_serving_tests() {
 
            new_test_name=$test_name"_qps_"$qps
 
-           client_command="python3 vllm_benchmarks/benchmark_serving.py \
+           client_command="vllm bench serve \
                --save-result \
                --result-dir $RESULTS_FOLDER \
                --result-filename ${new_test_name}.json \
@@ -271,17 +278,11 @@ cleanup_on_error() {
    rm -rf $RESULTS_FOLDER
 }
 
-get_benchmarks_scripts() {
-    git clone -b main --depth=1 https://github.com/vllm-project/vllm.git && \
-    mv vllm/benchmarks vllm_benchmarks
-    rm -rf ./vllm
-}
-
 main() {
-
     START_TIME=$(date +%s)
     check_npus
-
+    python3 benchmarks/scripts/patch_benchmark_dataset.py
+
     # dependencies
     (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
     (which jq) || (apt-get update && apt-get -y install jq)
@@ -298,8 +299,6 @@ main() {
 
     # prepare for benchmarking
     cd benchmarks || exit 1
-    get_benchmarks_scripts
-    python3 scripts/patch_benchmark_dataset.py --path vllm_benchmarks/benchmark_dataset.py
     trap cleanup EXIT
 
     QUICK_BENCHMARK_ROOT=./
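The hunk headers above reference the script's `json2args` helper, which converts each test's JSON `parameters` block into command-line flags for `vllm bench`. Its body is not part of this diff; a jq-based sketch of that kind of conversion (an assumption about the mechanism, not the script's exact implementation) looks like this:

```shell
# Sketch: turn {"tensor_parallel_size": 1, "load_format": "dummy"}
# into "--tensor-parallel-size 1 --load-format dummy".
json2args() {
    local json_string=$1
    echo "$json_string" | jq -r '
        to_entries |
        map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
        join(" ")'
}
```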

benchmarks/tests/serving-tests.json

Lines changed: 3 additions & 3 deletions
@@ -18,7 +18,7 @@
         },
         "client_parameters": {
             "model": "Qwen/Qwen2.5-VL-7B-Instruct",
-            "backend": "openai-chat",
+            "endpoint_type": "openai-chat",
             "dataset_name": "hf",
             "hf_split": "train",
             "endpoint": "/v1/chat/completions",
@@ -44,7 +44,7 @@
         },
         "client_parameters": {
             "model": "Qwen/Qwen3-8B",
-            "backend": "vllm",
+            "endpoint_type": "vllm",
             "dataset_name": "sharegpt",
             "dataset_path": "/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json",
             "num_prompts": 200
@@ -68,7 +68,7 @@
         },
         "client_parameters": {
             "model": "Qwen/Qwen2.5-7B-Instruct",
-            "backend": "vllm",
+            "endpoint_type": "vllm",
             "dataset_name": "sharegpt",
             "dataset_path": "/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json",
             "num_prompts": 200

0 commit comments
