
Commit e4e9ea0

Upgrade vLLM version to v0.9.2 (#1652)
### What this PR does / why we need it?

This patch upgrades the vLLM version to v0.9.2. The v0.9.1-compatible code is intentionally kept in place to ease review.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

- vLLM version: v0.9.1
- vLLM main: vllm-project/vllm@14601f5
- Accuracy test with 0.9.2: https://github.com/vllm-project/vllm-ascend/actions/runs/16121612087

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
1 parent 71de52d commit e4e9ea0

File tree

10 files changed: +15 additions, −14 deletions

.github/workflows/accuracy_test.yaml

Lines changed: 2 additions & 1 deletion

```diff
@@ -37,6 +37,7 @@ on:
         # Current supported vLLM versions
         options:
           - main
+          - v0.9.2
           - v0.9.1
           - v0.7.3
       vllm-ascend-version:
@@ -163,7 +164,7 @@ jobs:
           repository: vllm-project/vllm
           path: ./vllm-empty
           # Please also update this when bump matched version
-          ref: ${{ github.event.inputs.vllm-version || 'v0.9.1' }}
+          ref: ${{ github.event.inputs.vllm-version || 'v0.9.2' }}

       - name: Install vllm-project/vllm from source
         working-directory: ./vllm-empty
```
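For context, the option list above belongs to a `workflow_dispatch` input, and the `||` expression below it falls back to `v0.9.2` when the workflow is triggered without an explicit choice. A minimal sketch of such an input block (the `description` wording is an assumption; the input name and options are taken from the diff):

```yaml
on:
  workflow_dispatch:
    inputs:
      vllm-version:
        # Current supported vLLM versions
        description: "vLLM version to test against (assumed wording)"
        required: false
        type: choice
        options:
          - main
          - v0.9.2
          - v0.9.1
          - v0.7.3
```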

.github/workflows/nightly_benchmarks.yaml

Lines changed: 1 addition & 1 deletion

```diff
@@ -50,7 +50,7 @@ jobs:
     strategy:
       matrix:
         include:
-          - vllm_branch: v0.9.1
+          - vllm_branch: v0.9.2
            vllm_ascend_branch: main
            vllm_use_v1: 1
     max-parallel: 1
```

.github/workflows/vllm_ascend_test.yaml

Lines changed: 4 additions & 4 deletions

```diff
@@ -138,13 +138,13 @@ jobs:
     if: ${{ needs.lint.result == 'success' || github.event_name == 'push' }}
     runs-on: ubuntu-latest
     container:
-      image: m.daocloud.io/quay.io/ascend/cann:8.1.rc1-910b-ubuntu22.04-py3.10
+      image: quay.io/ascend/cann:8.1.rc1-910b-ubuntu22.04-py3.10
     env:
       VLLM_LOGGING_LEVEL: ERROR
       VLLM_USE_MODELSCOPE: True
     strategy:
       matrix:
-        vllm_version: [main, v0.9.1]
+        vllm_version: [main, v0.9.2]
     steps:
       - name: Install packages
         run: |
@@ -201,7 +201,7 @@ jobs:
       max-parallel: 2
       matrix:
         os: [linux-arm64-npu-1]
-        vllm_version: [main, v0.9.1]
+        vllm_version: [main, v0.9.2]
     name: singlecard e2e test
     runs-on: ${{ matrix.os }}
     container:
@@ -302,7 +302,7 @@ jobs:
       max-parallel: 1
      matrix:
        os: [linux-arm64-npu-4]
-        vllm_version: [main, v0.9.1]
+        vllm_version: [main, v0.9.2]
     name: multicard e2e test
     runs-on: ${{ matrix.os }}
     container:
```
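Each `matrix` entry above fans out into one job per combination of axis values, so bumping `vllm_version` to `[main, v0.9.2]` keeps the CI running against both the development branch and the new release. A hedged Python sketch of that expansion, for illustration only (this is not how GitHub Actions is actually implemented):

```python
from itertools import product

# Illustrative expansion of an Actions-style matrix into job configs.
# The axis names and values mirror the singlecard matrix in the diff above.
matrix = {
    "os": ["linux-arm64-npu-1"],
    "vllm_version": ["main", "v0.9.2"],
}

# One job per element of the Cartesian product of all matrix axes.
jobs = [dict(zip(matrix.keys(), combo)) for combo in product(*matrix.values())]
```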

.github/workflows/vllm_ascend_test_long_term.yaml

Lines changed: 1 addition & 1 deletion

```diff
@@ -43,7 +43,7 @@ jobs:
       max-parallel: 2
       matrix:
         os: [linux-arm64-npu-1, linux-arm64-npu-4]
-        vllm_version: [main, v0.9.1]
+        vllm_version: [main, v0.9.2]
     name: vLLM Ascend long term test
     runs-on: ${{ matrix.os }}
     container:
```

Dockerfile

Lines changed: 1 addition & 1 deletion

```diff
@@ -37,7 +37,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}

 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.9.1
+ARG VLLM_TAG=v0.9.2
 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
```
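Because `VLLM_TAG` is a Dockerfile `ARG`, the pinned vLLM tag can be overridden at build time without editing the file; the same applies to the other Dockerfile variants below. A hedged sketch of such an invocation (the image tag `vllm-ascend:dev` is illustrative, not a name used by the project):

```shell
# Build with the default tag (v0.9.2 after this commit), or pin another one
# via --build-arg without touching the Dockerfile.
docker build -f Dockerfile --build-arg VLLM_TAG=v0.9.2 -t vllm-ascend:dev .
```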

Dockerfile.310p

Lines changed: 1 addition & 1 deletion

```diff
@@ -37,7 +37,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}

 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.9.1
+ARG VLLM_TAG=v0.9.2
 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
```

Dockerfile.310p.openEuler

Lines changed: 1 addition & 1 deletion

```diff
@@ -34,7 +34,7 @@ COPY . /vllm-workspace/vllm-ascend/

 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.9.1
+ARG VLLM_TAG=v0.9.2

 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
```

Dockerfile.openEuler

Lines changed: 1 addition & 1 deletion

```diff
@@ -34,7 +34,7 @@ COPY . /vllm-workspace/vllm-ascend/

 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.9.1
+ARG VLLM_TAG=v0.9.2

 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
```

docs/source/community/versioning_policy.md

Lines changed: 2 additions & 2 deletions

```diff
@@ -74,8 +74,8 @@ Usually, each minor version of vLLM (such as 0.7) will correspond to a vLLM Asce

 | Branch     | Status       | Note                                 |
 |------------|--------------|--------------------------------------|
-| main       | Maintained   | CI commitment for vLLM main branch and vLLM 0.9.x branch |
-| v0.9.1-dev | Maintained   | CI commitment for vLLM 0.9.0 and 0.9.1 version |
+| main       | Maintained   | CI commitment for vLLM main branch and vLLM 0.9.2 branch |
+| v0.9.1-dev | Maintained   | CI commitment for vLLM 0.9.1 version |
 | v0.7.3-dev | Maintained   | CI commitment for vLLM 0.7.3 version |
 | v0.7.1-dev | Unmaintained | Replaced by v0.7.3-dev |
```

docs/source/user_guide/graph_mode.md

Lines changed: 1 addition & 1 deletion

```diff
@@ -12,7 +12,7 @@ From v0.9.1rc1 with V1 Engine, vLLM Ascend will run models in graph mode by defa

 There are two kinds for graph mode supported by vLLM Ascend:
 - **ACLGraph**: This is the default graph mode supported by vLLM Ascend. In v0.9.1rc1, only Qwen series models are well tested.
-- **TorchAirGraph**: This is the GE graph mode. In v0.9.1rc1, only DeepSeek series models are supported. In v0.9.1rc2, we also support PanguProMoe with torchair.
+- **TorchAirGraph**: This is the GE graph mode. In v0.9.1rc1, only DeepSeek series models are supported.

 ## Using ACLGraph
 ACLGraph is enabled by default. Take Qwen series models as an example, just set to use V1 Engine is enough.
```
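Per the doc text above, enabling the V1 Engine is enough to get ACLGraph for Qwen series models. A hedged example invocation (the model name is illustrative; `VLLM_USE_V1` is the same switch the benchmark workflow sets via `vllm_use_v1: 1`):

```shell
# With the V1 engine enabled, ACLGraph is the default graph mode on Ascend.
VLLM_USE_V1=1 vllm serve Qwen/Qwen2.5-7B-Instruct
```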
