diff --git a/.github/workflows/nightly_benchmarks.yaml b/.github/workflows/nightly_benchmarks.yaml
index c419ac6275..8dcb218183 100644
--- a/.github/workflows/nightly_benchmarks.yaml
+++ b/.github/workflows/nightly_benchmarks.yaml
@@ -65,6 +65,8 @@ jobs:
       options: >-
         --device /dev/davinci0
         --device /dev/davinci1
+        --device /dev/davinci2
+        --device /dev/davinci3
         --device /dev/davinci_manager
         --device /dev/devmm_svm
         --device /dev/hisi_hdc
diff --git a/benchmarks/scripts/patch_benchmark_dataset.py b/benchmarks/scripts/patch_benchmark_dataset.py
index 5c8a6662d2..893ad67ae8 100644
--- a/benchmarks/scripts/patch_benchmark_dataset.py
+++ b/benchmarks/scripts/patch_benchmark_dataset.py
@@ -6,6 +6,8 @@
 
 # Patch the benchmark_dataset.py file to set streaming=False in load_dataset calls
 
+VLLM_EDITABLE_PATH = "/_w/vllm-ascend/vllm-ascend/vllm-empty/vllm/benchmarks/datasets.py"
+
 # TDOO(Potabk): Remove this patch when the issue is fixed in the upstream
 
 class StreamingFalseTransformer(cst.CSTTransformer):
@@ -68,10 +70,9 @@ def patch_file(path):
         description=
         "Patch benchmark_dataset.py to set streaming=False in load_dataset calls"
     )
-    parser.add_argument(
-        "--path",
-        type=str,
-        default="/vllm-workspace/vllm/vllm/benchmarks/datasets.py",
-        help="Path to the benchmark_dataset.py file")
+    parser.add_argument("--path",
+                        type=str,
+                        default=VLLM_EDITABLE_PATH,
+                        help="Path to the benchmark_dataset.py file")
     args = parser.parse_args()
     patch_file(args.path)
diff --git a/benchmarks/scripts/run-performance-benchmarks.sh b/benchmarks/scripts/run-performance-benchmarks.sh
index 085edb2435..8c7782d3f5 100644
--- a/benchmarks/scripts/run-performance-benchmarks.sh
+++ b/benchmarks/scripts/run-performance-benchmarks.sh
@@ -187,6 +187,15 @@ run_serving_tests() {
     local serving_test_file
     serving_test_file=$1
 
+    cleanup() {
+        if [[ -n "$server_pid" ]]; then
+            echo "Cleaning up server process $server_pid..."
+            kill -9 "$server_pid" 2>/dev/null || true
+        fi
+        kill_npu_processes
+    }
+
+    trap cleanup EXIT INT TERM
 
     # Iterate over serving tests
     jq -c '.[]' "$serving_test_file" | while read -r params; do
@@ -267,11 +276,9 @@ run_serving_tests() {
         kill -9 $server_pid
         kill_npu_processes
     done
+    trap - EXIT INT TERM
 }
 
-cleanup() {
-    rm -rf ./vllm_benchmarks
-}
 
 cleanup_on_error() {
     echo "An error occurred. Cleaning up results folder..."
diff --git a/benchmarks/tests/latency-tests.json b/benchmarks/tests/latency-tests.json
index 40cec4c5f6..68733863c1 100644
--- a/benchmarks/tests/latency-tests.json
+++ b/benchmarks/tests/latency-tests.json
@@ -19,5 +19,25 @@
             "num_iters_warmup": 5,
             "num_iters": 15
         }
+    },
+    {
+        "test_name": "latency_qwen3_30B_A3B_tp4",
+        "parameters": {
+            "model": "Qwen/Qwen3-30B-A3B",
+            "tensor_parallel_size": 4,
+            "load_format": "dummy",
+            "num_iters_warmup": 5,
+            "num_iters": 15
+        }
+    },
+    {
+        "test_name": "latency_qwen3_32B_tp4",
+        "parameters": {
+            "model": "Qwen/Qwen3-32B",
+            "tensor_parallel_size": 4,
+            "load_format": "dummy",
+            "num_iters_warmup": 5,
+            "num_iters": 15
+        }
     }
 ]
diff --git a/benchmarks/tests/serving-tests.json b/benchmarks/tests/serving-tests.json
index 6398710f5b..7607f16576 100644
--- a/benchmarks/tests/serving-tests.json
+++ b/benchmarks/tests/serving-tests.json
@@ -73,5 +73,53 @@
             "dataset_path": "/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json",
             "num_prompts": 200
         }
+    },
+    {
+        "test_name": "serving_qwen3_30B_A3B_tp4",
+        "qps_list": [
+            1,
+            4,
+            16,
+            "inf"
+        ],
+        "server_parameters": {
+            "model": "Qwen/Qwen3-30B-A3B",
+            "tensor_parallel_size": 4,
+            "swap_space": 16,
+            "disable_log_stats": "",
+            "disable_log_requests": "",
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "Qwen/Qwen3-30B-A3B",
+            "endpoint_type": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_qwen3_32B_tp4",
+        "qps_list": [
+            1,
+            4,
+            16,
+            "inf"
+        ],
+        "server_parameters": {
+            "model": "Qwen/Qwen3-32B",
+            "tensor_parallel_size": 4,
+            "swap_space": 16,
+            "disable_log_stats": "",
+            "disable_log_requests": "",
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "Qwen/Qwen3-32B",
+            "endpoint_type": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
     }
 ]
diff --git a/benchmarks/tests/throughput-tests.json b/benchmarks/tests/throughput-tests.json
index 3698e69f3f..629955903b 100644
--- a/benchmarks/tests/throughput-tests.json
+++ b/benchmarks/tests/throughput-tests.json
@@ -33,6 +33,28 @@
             "num_prompts": 200,
             "backend": "vllm"
         }
+    },
+    {
+        "test_name": "throughput_qwen3_30B_A3B_tp4",
+        "parameters": {
+            "model": "Qwen/Qwen3-30B-A3B",
+            "tensor_parallel_size": 4,
+            "load_format": "dummy",
+            "dataset_path": "/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200,
+            "backend": "vllm"
+        }
+    },
+    {
+        "test_name": "throughput_qwen3_32B_tp4",
+        "parameters": {
+            "model": "Qwen/Qwen3-32B",
+            "tensor_parallel_size": 4,
+            "load_format": "dummy",
+            "dataset_path": "/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200,
+            "backend": "vllm"
+        }
     }
 ]