Commit d9fb027

[CI] Add benchmark workflows (#1014)

### What this PR does / why we need it?
Add benchmark workflows

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
Run locally

---------
Signed-off-by: wangli <wangli858794774@gmail.com>

1 parent 5a1689f commit d9fb027
File tree (5 files changed: +217 -31 lines)

.github/workflows/nightly_benchmarks.yaml
benchmarks/scripts/run-performance-benchmarks.sh
benchmarks/tests/latency-tests.json
benchmarks/tests/serving-tests.json
benchmarks/tests/throughput-tests.json
.github/workflows/nightly_benchmarks.yaml

Lines changed: 160 additions & 0 deletions

@@ -0,0 +1,160 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# This file is a part of the vllm-ascend project.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+name: 'run benchmarks main'
+
+on:
+  schedule:
+    - cron: '00 16 * * *'
+  workflow_dispatch:
+
+# after merged, secrets will be available
+# pull_request:
+#   branches:
+#     - 'main'
+#     - '*-dev'
+#   paths:
+#     - '.github/workflows/nightly_benchmarks.yaml'
+
+
+# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
+# declared as "shell: bash -el {0}" on steps that need to be properly activated.
+# It's used to activate ascend-toolkit environment variables.
+defaults:
+  run:
+    shell: bash -el {0}
+
+jobs:
+  test:
+    name: run benchmarks main
+    runs-on: 'linux-arm64-npu-static-8'
+    strategy:
+      matrix:
+        include:
+          - vllm_branch: v0.9.0
+            vllm_ascend_branch: main
+    container:
+      image: m.daocloud.io/quay.io/ascend/cann:8.1.rc1-910b-ubuntu22.04-py3.10
+      volumes:
+        - /usr/local/dcmi:/usr/local/dcmi
+        - /usr/local/bin/npu-smi:/usr/local/bin/npu-smi
+        - /usr/local/Ascend/driver/:/usr/local/Ascend/driver/
+        # Use self-hosted cache to speed up pip and model download
+        - /home/action/.cache:/github/home/.cache/
+      options: >-
+        --device /dev/davinci0
+        --device /dev/davinci1
+        --device /dev/davinci_manager
+        --device /dev/devmm_svm
+        --device /dev/hisi_hdc
+      env:
+        HF_ENDPOINT: https://hf-mirror.com
+        HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        ES_OM_DOMAIN: ${{ secrets.ES_OM_DOMAIN }}
+        ES_OM_AUTHORIZATION: ${{ secrets.ES_OM_AUTHORIZATION }}
+    steps:
+      - name: Check npu and CANN info
+        run: |
+          npu-smi info
+          cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
+
+      - name: Config mirrors
+        run: |
+          pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
+
+      - name: Install system dependencies
+        run: |
+          apt-get update -y
+          apt-get -y install git jq wget curl lsof gcc g++ cmake libnuma-dev
+
+      - name: Config git
+        run: |
+          git config --global --add safe.directory "$GITHUB_WORKSPACE"
+          git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/
+
+      - name: Checkout vllm-project/vllm-ascend repo
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ matrix.vllm_ascend_branch }}
+
+      - name: Checkout vllm-project/vllm repo
+        uses: actions/checkout@v4
+        with:
+          repository: vllm-project/vllm
+          path: ./vllm-empty
+          ref: ${{ matrix.vllm_branch }}
+
+      - name: Install vllm-project/vllm from source
+        working-directory: ./vllm-empty
+        run: |
+          VLLM_TARGET_DEVICE=empty pip install -e .
+
+      - name: Install vllm-project/vllm-ascend
+        run: |
+          pip install -e .
+          pip install -r benchmarks/requirements-bench.txt
+
+      - name: Checkout cosdt/elastic-tool
+        uses: actions/checkout@v4
+        with:
+          repository: cosdt/elastic-tool
+          path: ./elastic_tool
+          ref: 0.1.0-dev
+
+      - name: Install elastic_tool
+        working-directory: ./elastic_tool
+        run: |
+          pip install -e .
+
+      - name: Collect pr info from vllm-project/vllm-ascend
+        run: |
+          # Only get the pull requests which may influence performance
+          git log --pretty=format:"%H %s" -- '**/*.py' ':!docs/*' ':!tests/*' ':!examples/*' > commit_log.txt
+          escli check commit_log.txt
+
+      - name: Run benchmark iteration
+        run: |
+          while IFS= read -r line || [[ -n "$line" ]]; do
+            commit_id=${line%% *}
+            commit_title=${line#* }
+            commit_time=$(git show -s --format=%cd $commit_id --date=iso-strict)
+            commit_time_no_tz=${commit_time::19}
+
+            git checkout $commit_id
+            pip install -e .
+
+            echo "------------------------"
+            echo "commit_id: $commit_id"
+            echo "commit_title: $commit_title"
+            echo "commit_time: $commit_time_no_tz"
+            echo "vllm branch: ${{ matrix.vllm_branch }}"
+            echo "vllm-ascend branch: ${{ matrix.vllm_ascend_branch }}"
+            echo "------------------------"
+
+            bash benchmarks/scripts/run-performance-benchmarks.sh
+            # send the result to es
+            if [[ "${{ github.event_name }}" != "pull_request" ]]; then
+              escli add --vllm_branch ${{ matrix.vllm_branch }} \
+                --vllm_ascend_branch ${{ matrix.vllm_ascend_branch }} \
+                --commit_id $commit_id \
+                --commit_title "$commit_title" \
+                --created_at "$commit_time_no_tz" \
+                --res_dir ./benchmarks/results
+              rm -rf ./benchmarks/results
+            fi
+          done < commit_log.txt
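A note on the "Run benchmark iteration" step above: `git show -s --format=%cd --date=iso-strict` prints the commit date in ISO 8601 form with a timezone offset, and `${commit_time::19}` keeps only the first 19 characters, i.e. the date and time without the offset. A minimal bash illustration (the timestamp value is made up):

commit_time="2025-06-04T16:00:00+08:00"   # example output of git show --date=iso-strict
commit_time_no_tz=${commit_time::19}      # bash substring expansion: first 19 characters
echo "$commit_time_no_tz"                 # prints 2025-06-04T16:00:00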

benchmarks/scripts/run-performance-benchmarks.sh

Lines changed: 26 additions & 6 deletions
@@ -1,5 +1,6 @@
 #!/bin/bash
 
+set -e
 
 check_npus() {
   # shellcheck disable=SC2155
@@ -48,7 +49,7 @@ wait_for_server() {
   # wait for vllm server to start
   # return 1 if vllm server crashes
   timeout 1200 bash -c '
-    until curl -X POST localhost:8000/v1/completions; do
+    until curl -s -X POST localhost:8000/v1/completions || curl -s -X POST localhost:8000/v1/chat/completions; do
       sleep 1
     done' && return 0 || return 1
 }
@@ -67,6 +68,16 @@ kill_npu_processes() {
 
 }
 
+update_json_field() {
+  local json_file="$1"
+  local field_name="$2"
+  local field_value="$3"
+
+  jq --arg value "$field_value" \
+    --arg key "$field_name" \
+    '.[$key] = $value' "$json_file" > "${json_file}.tmp" && \
+    mv "${json_file}.tmp" "$json_file"
+}
 
 run_latency_tests() {
   # run latency tests using `benchmark_latency.py`
@@ -103,7 +114,9 @@ run_latency_tests() {
 
     # run the benchmark
     eval "$latency_command"
-
+    # echo model_name to result file
+    model_name=$(echo "$latency_params" | jq -r '.model')
+    update_json_field "$RESULTS_FOLDER/${test_name}.json" "model_name" "$model_name"
     kill_npu_processes
 
   done
@@ -144,7 +157,9 @@ run_throughput_tests() {
 
     # run the benchmark
     eval "$throughput_command"
-
+    # echo model_name to result file
+    model_name=$(echo "$throughput_params" | jq -r '.model')
+    update_json_field "$RESULTS_FOLDER/${test_name}.json" "model_name" "$model_name"
     kill_npu_processes
 
   done
@@ -242,8 +257,13 @@ cleanup() {
   rm -rf ./vllm_benchmarks
 }
 
+cleanup_on_error() {
+  echo "An error occurred. Cleaning up results folder..."
+  rm -rf $RESULTS_FOLDER
+}
+
 get_benchmarks_scripts() {
-  git clone -b main --depth=1 git@github.com:vllm-project/vllm.git && \
+  git clone -b main --depth=1 https://github.com/vllm-project/vllm.git && \
   mv vllm/benchmarks vllm_benchmarks
   rm -rf ./vllm
 }
@@ -263,9 +283,8 @@ main() {
   export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
   # turn off the reporting of the status of each request, to clean up the terminal output
   export VLLM_LOG_LEVEL="WARNING"
-
+
   # set env
-  export VLLM_USE_MODELSCOPE="True"
   export HF_ENDPOINT="https://hf-mirror.com"
 
   # prepare for benchmarking
@@ -278,6 +297,7 @@
   declare -g RESULTS_FOLDER=results
   mkdir -p $RESULTS_FOLDER
 
+  trap cleanup_on_error ERR
   ensure_sharegpt_downloaded
   # benchmarks
   run_serving_tests $QUICK_BENCHMARK_ROOT/tests/serving-tests.json
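For reference, the new update_json_field helper above adds or overwrites a single key in a benchmark result file via jq, writing to a temporary file and moving it back so the original is not truncated if jq fails. A minimal usage sketch, reusing the test and model names from the updated latency-tests.json (values here are illustrative, not part of the commit):

RESULTS_FOLDER=results
test_name=latency_qwen3_8B_tp1
model_name="Qwen/Qwen3-8B"
# after this call the result JSON contains "model_name": "Qwen/Qwen3-8B"
update_json_field "$RESULTS_FOLDER/${test_name}.json" "model_name" "$model_name"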

benchmarks/tests/latency-tests.json

Lines changed: 6 additions & 5 deletions
@@ -1,20 +1,21 @@
 [
     {
-        "test_name": "latency_llama8B_tp1",
+        "test_name": "latency_qwen2_5vl_7B_tp1",
         "parameters": {
-            "model": "LLM-Research/Meta-Llama-3.1-8B-Instruct",
+            "model": "Qwen/Qwen2.5-VL-7B-Instruct",
             "tensor_parallel_size": 1,
-            "load_format": "dummy",
+            "max_model_len": 16384,
             "num_iters_warmup": 5,
             "num_iters": 15
         }
     },
     {
-        "test_name": "latency_qwen2_5_7B_tp1",
+        "test_name": "latency_qwen3_8B_tp1",
         "parameters": {
-            "model": "Qwen/Qwen2.5-7B-Instruct",
+            "model": "Qwen/Qwen3-8B",
             "tensor_parallel_size": 1,
             "load_format": "dummy",
+            "max_model_len": 16384,
             "num_iters_warmup": 5,
             "num_iters": 15
         }

benchmarks/tests/serving-tests.json

Lines changed: 14 additions & 11 deletions
@@ -1,49 +1,52 @@
 [
     {
-        "test_name": "serving_llama8B_tp1",
+        "test_name": "serving_qwen2_5vl_7B_tp1",
         "qps_list": [
             1,
             4,
             16,
             "inf"
         ],
         "server_parameters": {
-            "model": "LLM-Research/Meta-Llama-3.1-8B-Instruct",
+            "model": "Qwen/Qwen2.5-VL-7B-Instruct",
             "tensor_parallel_size": 1,
             "swap_space": 16,
             "disable_log_stats": "",
             "disable_log_requests": "",
-            "load_format": "dummy"
+            "trust_remote_code": "",
+            "max_model_len": 16384
         },
         "client_parameters": {
-            "model": "LLM-Research/Meta-Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "model": "Qwen/Qwen2.5-VL-7B-Instruct",
+            "backend": "openai-chat",
+            "dataset_name": "hf",
+            "hf_split": "train",
+            "endpoint": "/v1/chat/completions",
+            "dataset_path": "lmarena-ai/vision-arena-bench-v0.1",
             "num_prompts": 200
         }
     },
     {
-        "test_name": "serving_qwen2_5_7B_tp1",
+        "test_name": "serving_qwen3_8B_tp1",
         "qps_list": [
             1,
             4,
             16,
             "inf"
         ],
         "server_parameters": {
-            "model": "Qwen/Qwen2.5-7B-Instruct",
+            "model": "Qwen/Qwen3-8B",
             "tensor_parallel_size": 1,
             "swap_space": 16,
             "disable_log_stats": "",
             "disable_log_requests": "",
             "load_format": "dummy"
         },
         "client_parameters": {
-            "model": "Qwen/Qwen2.5-7B-Instruct",
+            "model": "Qwen/Qwen3-8B",
             "backend": "vllm",
             "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "dataset_path": "/root/.cache/datasets/sharegpt/ShareGPT_V3_unfiltered_cleaned_split.json",
             "num_prompts": 200
         }
     }

benchmarks/tests/throughput-tests.json

Lines changed: 11 additions & 9 deletions
@@ -1,24 +1,26 @@
 [
     {
-        "test_name": "throughput_llama8B_tp1",
+        "test_name": "throughput_qwen3_8B_tp1",
         "parameters": {
-            "model": "LLM-Research/Meta-Llama-3.1-8B-Instruct",
+            "model": "Qwen/Qwen3-8B",
             "tensor_parallel_size": 1,
             "load_format": "dummy",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "dataset_path": "/root/.cache/datasets/sharegpt/ShareGPT_V3_unfiltered_cleaned_split.json",
             "num_prompts": 200,
             "backend": "vllm"
         }
     },
     {
-        "test_name": "throughput_qwen2_5_7B_tp1",
+        "test_name": "throughput_qwen2_5vl_7B_tp1",
         "parameters": {
-            "model": "Qwen/Qwen2.5-7B-Instruct",
+            "model": "Qwen/Qwen2.5-VL-7B-Instruct",
             "tensor_parallel_size": 1,
-            "load_format": "dummy",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200,
-            "backend": "vllm"
+            "backend": "vllm-chat",
+            "dataset_name": "hf",
+            "hf_split": "train",
+            "max_model_len": 16384,
+            "dataset_path": "lmarena-ai/vision-arena-bench-v0.1",
+            "num_prompts": 200
         }
     }
 ]
