[CI][Benchmark] Add new model and v1 test to perf benchmarks (#1099)

Potabk · wangxiaoxin (A) · commit 0de5ef8991d8 · 2025-06-17T14:55:53.000+08:00
### What this PR does / why we need it?
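- Add Qwen2.5-7B-Instruct (tensor parallel size 1) latency, serving, and throughput benchmark tests
- Add a V1 engine run to the nightly benchmark matrix (`vllm_use_v1`, exported as `VLLM_USE_V1`)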
---------

Signed-off-by: wangli <wangli858794774@gmail.com>
Signed-off-by: wangxiaoxin (A) <wangxiaoxin7@huawei.com>
diff --git a/.github/workflows/nightly_benchmarks.yaml b/.github/workflows/nightly_benchmarks.yaml
@@ -41,13 +41,18 @@ jobs:
   test:
     if: ${{ contains(github.event.pull_request.labels.*.name, 'performance-test') && contains(github.event.pull_request.labels.*.name, 'ready-for-test') || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
 
-    name: Benchmarks/vLLM=${{ matrix.vllm_branch }}, vLLM-Ascend=${{ matrix.vllm_ascend_branch }}
+    name: Benchmarks/vLLM=${{ matrix.vllm_branch }}, vLLM-Ascend=${{ matrix.vllm_ascend_branch }}, use_v1=${{ matrix.vllm_use_v1 }}
     runs-on: 'linux-arm64-npu-static-8'
     strategy:
       matrix:
         include:
           - vllm_branch: v0.9.1
             vllm_ascend_branch: main
+            vllm_use_v1: 0
+          - vllm_branch: v0.9.0
+            vllm_ascend_branch: main
+            vllm_use_v1: 1
+      max-parallel: 1
     container:
       image: m.daocloud.io/quay.io/ascend/cann:8.1.rc1-910b-ubuntu22.04-py3.10
       volumes:
@@ -67,6 +72,7 @@ jobs:
         HF_TOKEN: ${{ secrets.HF_TOKEN }}
         ES_OM_DOMAIN: ${{ secrets.ES_OM_DOMAIN }}
         ES_OM_AUTHORIZATION: ${{ secrets.ES_OM_AUTHORIZATION }}
+        VLLM_USE_V1: ${{ matrix.vllm_use_v1 }}
     steps:
       - name: Check npu and CANN info
         run: |
@@ -136,7 +142,7 @@ jobs:
       - name: Install elastic_tool
         if: github.event_name != 'pull_request'
         run: |
-          pip install escli-tool==0.2.1
+          pip install escli-tool==0.2.2
           
       - name: Collect pr info from vllm-project/vllm-ascend
         if: github.event_name != 'pull_request'
@@ -173,17 +179,17 @@ jobs:
             echo "vllm branch: ${{ matrix.vllm_branch }}"
             echo "vllm-ascend branch: ${{ matrix.vllm_ascend_branch }}"
             echo "------------------------"
+            
             cd /github/home
             bash benchmarks/scripts/run-performance-benchmarks.sh
             # send the result to es
-            if [[ "${{ github.event_name }}" != "pull request" ]]; then
-              escli add --vllm_branch ${{ matrix.vllm_branch }} \
-              --vllm_ascend_branch ${{ matrix.vllm_ascend_branch }} \
-              --commit_id $commit_id \
-              --commit_title "$commit_title" \
-              --created_at "$commit_time_no_tz" \
-              --res_dir ./benchmarks/results 
-              rm -rf ./benchmarks/results
-            fi
+            escli add --vllm_branch ${{ matrix.vllm_branch }} \
+            --vllm_ascend_branch ${{ matrix.vllm_ascend_branch }} \
+            --commit_id $commit_id \
+            --commit_title "$commit_title" \
+            --created_at "$commit_time_no_tz" \
+            --res_dir ./benchmarks/results \
+            --extra_feat '{"VLLM_USE_V1": "${{ matrix.vllm_use_v1 }}"}'
+            rm -rf ./benchmarks/results
             cd -
           done < commit_log.txt
diff --git a/benchmarks/tests/latency-tests.json b/benchmarks/tests/latency-tests.json
@@ -9,5 +9,15 @@
       "num_iters_warmup": 5,
       "num_iters": 15
     }
+  },
+  {
+    "test_name": "latency_qwen2_5_7B_tp1",
+    "parameters": {
+      "model": "Qwen/Qwen2.5-7B-Instruct",
+      "tensor_parallel_size": 1,
+      "load_format": "dummy",
+      "num_iters_warmup": 5,
+      "num_iters": 15
+    }
   }
 ]
diff --git a/benchmarks/tests/serving-tests.json b/benchmarks/tests/serving-tests.json
@@ -49,5 +49,29 @@
       "dataset_path": "/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json",
       "num_prompts": 200
     }
+  },
+  {
+    "test_name": "serving_qwen2_5_7B_tp1",
+    "qps_list": [
+      1,
+      4,
+      16,
+      "inf"
+    ],
+    "server_parameters": {
+      "model": "Qwen/Qwen2.5-7B-Instruct",
+      "tensor_parallel_size": 1,
+      "swap_space": 16,
+      "disable_log_stats": "",
+      "disable_log_requests": "",
+      "load_format": "dummy"
+    },
+    "client_parameters": {
+      "model": "Qwen/Qwen2.5-7B-Instruct",
+      "backend": "vllm",
+      "dataset_name": "sharegpt",
+      "dataset_path": "/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json",
+      "num_prompts": 200
+    }
   }
 ]
diff --git a/benchmarks/tests/throughput-tests.json b/benchmarks/tests/throughput-tests.json
@@ -22,6 +22,17 @@
       "dataset_path": "lmarena-ai/vision-arena-bench-v0.1",
       "num_prompts": 200
     }
+  },
+  {
+    "test_name": "throughput_qwen2_5_7B_tp1",
+    "parameters": {
+      "model": "Qwen/Qwen2.5-7B-Instruct",
+      "tensor_parallel_size": 1,
+      "load_format": "dummy",
+      "dataset_path": "/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json",
+      "num_prompts": 200,
+      "backend": "vllm"
+    }
   }
 ]
 

Original file line number	Diff line number	Diff line change
`@@ -9,5 +9,15 @@`
`9`	`9`	`"num_iters_warmup": 5,`
`10`	`10`	`"num_iters": 15`
`11`	`11`	`}`
	`12`	`+ },`
	`13`	`+ {`
	`14`	`+ "test_name": "latency_qwen2_5_7B_tp1",`
	`15`	`+ "parameters": {`
	`16`	`+ "model": "Qwen/Qwen2.5-7B-Instruct",`
	`17`	`+ "tensor_parallel_size": 1,`
	`18`	`+ "load_format": "dummy",`
	`19`	`+ "num_iters_warmup": 5,`
	`20`	`+ "num_iters": 15`
	`21`	`+ }`
`12`	`22`	`}`
`13`	`23`	`]`
Original file line number	Diff line number	Diff line change
`@@ -22,6 +22,17 @@`
`22`	`22`	`"dataset_path": "lmarena-ai/vision-arena-bench-v0.1",`
`23`	`23`	`"num_prompts": 200`
`24`	`24`	`}`
	`25`	`+ },`
	`26`	`+ {`
	`27`	`+ "test_name": "throughput_qwen2_5_7B_tp1",`
	`28`	`+ "parameters": {`
	`29`	`+ "model": "Qwen/Qwen2.5-7B-Instruct",`
	`30`	`+ "tensor_parallel_size": 1,`
	`31`	`+ "load_format": "dummy",`
	`32`	`+ "dataset_path": "/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json",`
	`33`	`+ "num_prompts": 200,`
	`34`	`+ "backend": "vllm"`
	`35`	`+ }`
`25`	`36`	`}`
`26`	`37`	`]`
`27`	`38`