From 892e873d9dd25f8468232ecbfe4c830c12d02405 Mon Sep 17 00:00:00 2001 From: wangli Date: Thu, 3 Jul 2025 14:20:32 +0800 Subject: [PATCH 1/5] add qwen3 model tests Signed-off-by: wangli --- .github/workflows/nightly_benchmarks.yaml | 2 + benchmarks/scripts/patch_benchmark_dataset.py | 11 +++-- benchmarks/tests/latency-tests.json | 20 ++++++++ benchmarks/tests/serving-tests.json | 48 +++++++++++++++++++ benchmarks/tests/throughput-tests.json | 22 +++++++++ 5 files changed, 98 insertions(+), 5 deletions(-) diff --git a/.github/workflows/nightly_benchmarks.yaml b/.github/workflows/nightly_benchmarks.yaml index c419ac6275..8dcb218183 100644 --- a/.github/workflows/nightly_benchmarks.yaml +++ b/.github/workflows/nightly_benchmarks.yaml @@ -65,6 +65,8 @@ jobs: options: >- --device /dev/davinci0 --device /dev/davinci1 + --device /dev/davinci2 + --device /dev/davinci3 --device /dev/davinci_manager --device /dev/devmm_svm --device /dev/hisi_hdc diff --git a/benchmarks/scripts/patch_benchmark_dataset.py b/benchmarks/scripts/patch_benchmark_dataset.py index 5c8a6662d2..893ad67ae8 100644 --- a/benchmarks/scripts/patch_benchmark_dataset.py +++ b/benchmarks/scripts/patch_benchmark_dataset.py @@ -6,6 +6,8 @@ # Patch the benchmark_dataset.py file to set streaming=False in load_dataset calls +VLLM_EDITABLE_PATH = "/_w/vllm-ascend/vllm-ascend/vllm-empty/vllm/benchmarks/datasets.py" + # TDOO(Potabk): Remove this patch when the issue is fixed in the upstream class StreamingFalseTransformer(cst.CSTTransformer): @@ -68,10 +70,9 @@ def patch_file(path): description= "Patch benchmark_dataset.py to set streaming=False in load_dataset calls" ) - parser.add_argument( - "--path", - type=str, - default="/vllm-workspace/vllm/vllm/benchmarks/datasets.py", - help="Path to the benchmark_dataset.py file") + parser.add_argument("--path", + type=str, + default=VLLM_EDITABLE_PATH, + help="Path to the benchmark_dataset.py file") args = parser.parse_args() patch_file(args.path) diff --git a/benchmarks/tests/latency-tests.json b/benchmarks/tests/latency-tests.json index 40cec4c5f6..68733863c1 100644 --- a/benchmarks/tests/latency-tests.json +++ b/benchmarks/tests/latency-tests.json @@ -19,5 +19,25 @@ "num_iters_warmup": 5, "num_iters": 15 } + }, + { + "test_name": "latency_qwen3_30B_A3B_tp4", + "parameters": { + "model": "Qwen/Qwen3-30B-A3B", + "tensor_parallel_size": 4, + "load_format": "dummy", + "num_iters_warmup": 5, + "num_iters": 15 + } + }, + { + "test_name": "latency_qwen3_32B_tp4", + "parameters": { + "model": "Qwen/Qwen3-32B", + "tensor_parallel_size": 4, + "load_format": "dummy", + "num_iters_warmup": 5, + "num_iters": 15 + } } ] diff --git a/benchmarks/tests/serving-tests.json b/benchmarks/tests/serving-tests.json index 6398710f5b..664504371e 100644 --- a/benchmarks/tests/serving-tests.json +++ b/benchmarks/tests/serving-tests.json @@ -73,5 +73,53 @@ "dataset_path": "/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200 } + }, + { + "test_name": "serving_qwen3_30B_A3B_tp1", + "qps_list": [ + 1, + 4, + 16, + "inf" + ], + "server_parameters": { + "model": "Qwen/Qwen3-30B-A3B", + "tensor_parallel_size": 4, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "Qwen/Qwen3-30B-A3B", + "endpoint_type": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_qwen3_32B_tp1", + "qps_list": [ + 1, + 4, + 16, + "inf" + ], + "server_parameters": { + "model": "Qwen/Qwen3-32B", + "tensor_parallel_size": 4, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "Qwen/Qwen3-32B", + "endpoint_type": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } } ] diff --git a/benchmarks/tests/throughput-tests.json b/benchmarks/tests/throughput-tests.json index 3698e69f3f..756849f4b8 100644 --- a/benchmarks/tests/throughput-tests.json +++ b/benchmarks/tests/throughput-tests.json @@ -33,6 +33,28 @@ "num_prompts": 200, "backend": "vllm" } + }, + { + "test_name": "throughput_qwen3_30B_A3B_tp1", + "parameters": { + "model": "Qwen/Qwen3-30B-A3B", + "tensor_parallel_size": 4, + "load_format": "dummy", + "dataset_path": "/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm" + } + }, + { + "test_name": "throughput_qwen3_32B_tp1", + "parameters": { + "model": "Qwen/Qwen3-32B", + "tensor_parallel_size": 4, + "load_format": "dummy", + "dataset_path": "/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm" + } } ] From c922df172b9852476b8c21599b4dcd29f29d828d Mon Sep 17 00:00:00 2001 From: wangli Date: Thu, 3 Jul 2025 14:48:25 +0800 Subject: [PATCH 2/5] test new nodel Signed-off-by: wangli --- benchmarks/tests/latency-tests.json | 21 -------- benchmarks/tests/serving-tests.json | 75 -------------------------- benchmarks/tests/throughput-tests.json | 35 ------------ 3 files changed, 131 deletions(-) diff --git a/benchmarks/tests/latency-tests.json b/benchmarks/tests/latency-tests.json index 68733863c1..ea353a504f 100644 --- a/benchmarks/tests/latency-tests.json +++ b/benchmarks/tests/latency-tests.json @@ -1,25 +1,4 @@ [ - { - "test_name": "latency_qwen3_8B_tp1", - "parameters": { - "model": "Qwen/Qwen3-8B", - "tensor_parallel_size": 1, - "load_format": "dummy", - "max_model_len": 16384, - "num_iters_warmup": 5, - "num_iters": 15 - } - }, - { - "test_name": "latency_qwen2_5_7B_tp1", - "parameters": { - "model": "Qwen/Qwen2.5-7B-Instruct", - "tensor_parallel_size": 1, - "load_format": "dummy", - "num_iters_warmup": 5, - "num_iters": 15 - } - }, { "test_name": "latency_qwen3_30B_A3B_tp4", "parameters": { diff --git a/benchmarks/tests/serving-tests.json b/benchmarks/tests/serving-tests.json index 664504371e..202c86e3ca 100644 --- a/benchmarks/tests/serving-tests.json +++ b/benchmarks/tests/serving-tests.json @@ -1,79 +1,4 @@ [ - { - "test_name": "serving_qwen2_5vl_7B_tp1", - "qps_list": [ - 1, - 4, - 16, - "inf" - ], - "server_parameters": { - "model": "Qwen/Qwen2.5-VL-7B-Instruct", - "tensor_parallel_size": 1, - "swap_space": 16, - "disable_log_stats": "", - "disable_log_requests": "", - "trust_remote_code": "", - "max_model_len": 16384 - }, - "client_parameters": { - "model": "Qwen/Qwen2.5-VL-7B-Instruct", - "endpoint_type": "openai-chat", - "dataset_name": "hf", - "hf_split": "train", - "endpoint": "/v1/chat/completions", - "dataset_path": "lmarena-ai/vision-arena-bench-v0.1", - "num_prompts": 200 - } - }, - { - "test_name": "serving_qwen3_8B_tp1", - "qps_list": [ - 1, - 4, - 16, - "inf" - ], - "server_parameters": { - "model": "Qwen/Qwen3-8B", - "tensor_parallel_size": 1, - "swap_space": 16, - "disable_log_stats": "", - "disable_log_requests": "", - "load_format": "dummy" - }, - "client_parameters": { - "model": "Qwen/Qwen3-8B", - "endpoint_type": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } - }, - { - "test_name": "serving_qwen2_5_7B_tp1", - "qps_list": [ - 1, - 4, - 16, - "inf" - ], - "server_parameters": { - "model": "Qwen/Qwen2.5-7B-Instruct", - "tensor_parallel_size": 1, - "swap_space": 16, - "disable_log_stats": "", - "disable_log_requests": "", - "load_format": "dummy" - }, - "client_parameters": { - "model": "Qwen/Qwen2.5-7B-Instruct", - "endpoint_type": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } - }, { "test_name": "serving_qwen3_30B_A3B_tp1", "qps_list": [ diff --git a/benchmarks/tests/throughput-tests.json b/benchmarks/tests/throughput-tests.json index 756849f4b8..a28edb1f10 100644 --- a/benchmarks/tests/throughput-tests.json +++ b/benchmarks/tests/throughput-tests.json @@ -1,39 +1,4 @@ [ - { - "test_name": "throughput_qwen3_8B_tp1", - "parameters": { - "model": "Qwen/Qwen3-8B", - "tensor_parallel_size": 1, - "load_format": "dummy", - "dataset_path": "/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200, - "backend": "vllm" - } - }, - { - "test_name": "throughput_qwen2_5vl_7B_tp1", - "parameters": { - "model": "Qwen/Qwen2.5-VL-7B-Instruct", - "tensor_parallel_size": 1, - "backend": "vllm-chat", - "dataset_name": "hf", - "hf_split": "train", - "max_model_len": 16384, - "dataset_path": "lmarena-ai/vision-arena-bench-v0.1", - "num_prompts": 200 - } - }, - { - "test_name": "throughput_qwen2_5_7B_tp1", - "parameters": { - "model": "Qwen/Qwen2.5-7B-Instruct", - "tensor_parallel_size": 1, - "load_format": "dummy", - "dataset_path": "/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200, - "backend": "vllm" - } - }, { "test_name": "throughput_qwen3_30B_A3B_tp1", "parameters": { From 8ecd7141055bd14920bafdf84ba7e54ad93fb3cc Mon Sep 17 00:00:00 2001 From: wangli Date: Thu, 3 Jul 2025 15:09:55 +0800 Subject: [PATCH 3/5] add trap Signed-off-by: wangli --- benchmarks/scripts/run-performance-benchmarks.sh | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/benchmarks/scripts/run-performance-benchmarks.sh b/benchmarks/scripts/run-performance-benchmarks.sh index 085edb2435..8c7782d3f5 100644 --- a/benchmarks/scripts/run-performance-benchmarks.sh +++ b/benchmarks/scripts/run-performance-benchmarks.sh @@ -187,6 +187,15 @@ run_serving_tests() { local serving_test_file serving_test_file=$1 + cleanup() { + if [[ -n "$server_pid" ]]; then + echo "Cleaning up server process $server_pid..." + kill -9 "$server_pid" 2>/dev/null || true + fi + kill_npu_processes + } + + trap cleanup EXIT INT TERM # Iterate over serving tests jq -c '.[]' "$serving_test_file" | while read -r params; do @@ -267,11 +276,9 @@ run_serving_tests() { kill -9 $server_pid kill_npu_processes done + trap - EXIT INT TERM } -cleanup() { - rm -rf ./vllm_benchmarks -} cleanup_on_error() { echo "An error occurred. Cleaning up results folder..." From 7c418cf0d5452da51a2149c94e17a339a33bb124 Mon Sep 17 00:00:00 2001 From: wangli Date: Thu, 3 Jul 2025 16:25:43 +0800 Subject: [PATCH 4/5] revert test Signed-off-by: wangli --- benchmarks/tests/latency-tests.json | 21 ++++++++ benchmarks/tests/serving-tests.json | 75 ++++++++++++++++++++++++++ benchmarks/tests/throughput-tests.json | 35 ++++++++++++ 3 files changed, 131 insertions(+) diff --git a/benchmarks/tests/latency-tests.json b/benchmarks/tests/latency-tests.json index ea353a504f..68733863c1 100644 --- a/benchmarks/tests/latency-tests.json +++ b/benchmarks/tests/latency-tests.json @@ -1,4 +1,25 @@ [ + { + "test_name": "latency_qwen3_8B_tp1", + "parameters": { + "model": "Qwen/Qwen3-8B", + "tensor_parallel_size": 1, + "load_format": "dummy", + "max_model_len": 16384, + "num_iters_warmup": 5, + "num_iters": 15 + } + }, + { + "test_name": "latency_qwen2_5_7B_tp1", + "parameters": { + "model": "Qwen/Qwen2.5-7B-Instruct", + "tensor_parallel_size": 1, + "load_format": "dummy", + "num_iters_warmup": 5, + "num_iters": 15 + } + }, { "test_name": "latency_qwen3_30B_A3B_tp4", "parameters": { diff --git a/benchmarks/tests/serving-tests.json b/benchmarks/tests/serving-tests.json index 202c86e3ca..664504371e 100644 --- a/benchmarks/tests/serving-tests.json +++ b/benchmarks/tests/serving-tests.json @@ -1,4 +1,79 @@ [ + { + "test_name": "serving_qwen2_5vl_7B_tp1", + "qps_list": [ + 1, + 4, + 16, + "inf" + ], + "server_parameters": { + "model": "Qwen/Qwen2.5-VL-7B-Instruct", + "tensor_parallel_size": 1, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "trust_remote_code": "", + "max_model_len": 16384 + }, + "client_parameters": { + "model": "Qwen/Qwen2.5-VL-7B-Instruct", + "endpoint_type": "openai-chat", + "dataset_name": "hf", + "hf_split": "train", + "endpoint": "/v1/chat/completions", + "dataset_path": "lmarena-ai/vision-arena-bench-v0.1", + "num_prompts": 200 + } + }, + { + "test_name": "serving_qwen3_8B_tp1", + "qps_list": [ + 1, + 4, + 16, + "inf" + ], + "server_parameters": { + "model": "Qwen/Qwen3-8B", + "tensor_parallel_size": 1, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "Qwen/Qwen3-8B", + "endpoint_type": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_qwen2_5_7B_tp1", + "qps_list": [ + 1, + 4, + 16, + "inf" + ], + "server_parameters": { + "model": "Qwen/Qwen2.5-7B-Instruct", + "tensor_parallel_size": 1, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "Qwen/Qwen2.5-7B-Instruct", + "endpoint_type": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, { "test_name": "serving_qwen3_30B_A3B_tp1", "qps_list": [ diff --git a/benchmarks/tests/throughput-tests.json b/benchmarks/tests/throughput-tests.json index a28edb1f10..756849f4b8 100644 --- a/benchmarks/tests/throughput-tests.json +++ b/benchmarks/tests/throughput-tests.json @@ -1,4 +1,39 @@ [ + { + "test_name": "throughput_qwen3_8B_tp1", + "parameters": { + "model": "Qwen/Qwen3-8B", + "tensor_parallel_size": 1, + "load_format": "dummy", + "dataset_path": "/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm" + } + }, + { + "test_name": "throughput_qwen2_5vl_7B_tp1", + "parameters": { + "model": "Qwen/Qwen2.5-VL-7B-Instruct", + "tensor_parallel_size": 1, + "backend": "vllm-chat", + "dataset_name": "hf", + "hf_split": "train", + "max_model_len": 16384, + "dataset_path": "lmarena-ai/vision-arena-bench-v0.1", + "num_prompts": 200 + } + }, + { + "test_name": "throughput_qwen2_5_7B_tp1", + "parameters": { + "model": "Qwen/Qwen2.5-7B-Instruct", + "tensor_parallel_size": 1, + "load_format": "dummy", + "dataset_path": "/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm" + } + }, { "test_name": "throughput_qwen3_30B_A3B_tp1", "parameters": { From 2476ab2eb88aa64136ddb0b5567a71a29bf2c63a Mon Sep 17 00:00:00 2001 From: wangli Date: Thu, 3 Jul 2025 16:28:08 +0800 Subject: [PATCH 5/5] fix format Signed-off-by: wangli --- benchmarks/tests/serving-tests.json | 4 ++-- benchmarks/tests/throughput-tests.json | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/benchmarks/tests/serving-tests.json b/benchmarks/tests/serving-tests.json index 664504371e..7607f16576 100644 --- a/benchmarks/tests/serving-tests.json +++ b/benchmarks/tests/serving-tests.json @@ -75,7 +75,7 @@ } }, { - "test_name": "serving_qwen3_30B_A3B_tp1", + "test_name": "serving_qwen3_30B_A3B_tp4", "qps_list": [ 1, 4, @@ -99,7 +99,7 @@ } }, { - "test_name": "serving_qwen3_32B_tp1", + "test_name": "serving_qwen3_32B_tp4", "qps_list": [ 1, 4, diff --git a/benchmarks/tests/throughput-tests.json b/benchmarks/tests/throughput-tests.json index 756849f4b8..629955903b 100644 --- a/benchmarks/tests/throughput-tests.json +++ b/benchmarks/tests/throughput-tests.json @@ -35,7 +35,7 @@ } }, { - "test_name": "throughput_qwen3_30B_A3B_tp1", + "test_name": "throughput_qwen3_30B_A3B_tp4", "parameters": { "model": "Qwen/Qwen3-30B-A3B", "tensor_parallel_size": 4, @@ -46,7 +46,7 @@ } }, { - "test_name": "throughput_qwen3_32B_tp1", + "test_name": "throughput_qwen3_32B_tp4", "parameters": { "model": "Qwen/Qwen3-32B", "tensor_parallel_size": 4,