Upstream 091 eplb dynamic #1663

Closed
Wants to merge 195 commits

Commits (195)
2a94321
add eplb policy
Jun 10, 2025
e91956f
add eplb updator
Jun 10, 2025
66b3d2e
implementation of VllmEplbAdaptor and D2DExpertWeightLoader
wanghanqingLYT Jun 10, 2025
05a536c
add eplb policy and updator
raindaywhu Jun 10, 2025
24ca412
Merge pull request #39 from raindaywhu/dev_whq_eplb
wanghanqingLYT Jun 10, 2025
86fe2c0
determine num_dense_layers and num_moe_layers by refering to model co…
wanghanqingLYT Jun 10, 2025
caeaf2c
Merge pull request #41 from raindaywhu/dev_whq_eplb
wanghanqingLYT Jun 10, 2025
e68e522
EPLB add eplb_worker
qmkakaxi Jun 10, 2025
f450936
Merge pull request #42 from raindaywhu/dev_mereg_wjh
qmkakaxi Jun 10, 2025
d639144
add ssd loader
qmkakaxi Jun 10, 2025
f1f936b
EPLB moed load collect
qmkakaxi Jun 10, 2025
bd924f2
delete invalida import
qmkakaxi Jun 10, 2025
7e9bb54
Merge pull request #43 from raindaywhu/dev_mereg_wjh
qmkakaxi Jun 10, 2025
afcce8e
fix bugs in fused_experts_with_all2all
wanghanqingLYT Jun 11, 2025
bca9b34
Merge pull request #44 from raindaywhu/dev_whq_eplb
wanghanqingLYT Jun 11, 2025
cc88ea7
add eplb tabel generator
Jun 11, 2025
f2d0a75
add eplb tabel generator
raindaywhu Jun 11, 2025
78079a7
Adapt static EPLB
qmkakaxi Jun 11, 2025
22f03db
Merge branch 'master' into br_wjh_eplb
qmkakaxi Jun 11, 2025
485e3d0
add enable_eplb in ascend_config
qmkakaxi Jun 11, 2025
9e5e117
enable_eplb -> dynamic_eplb
qmkakaxi Jun 11, 2025
c28f6cb
add eplb policy
Jun 10, 2025
839cab1
add eplb updator
Jun 10, 2025
6ad801d
implementation of VllmEplbAdaptor and D2DExpertWeightLoader
wanghanqingLYT Jun 10, 2025
34109ac
determine num_dense_layers and num_moe_layers by refering to model co…
wanghanqingLYT Jun 10, 2025
c15f8a8
EPLB add eplb_worker
qmkakaxi Jun 10, 2025
1400ea3
add ssd loader
qmkakaxi Jun 10, 2025
28af393
EPLB moed load collect
qmkakaxi Jun 10, 2025
aa619f4
delete invalida import
qmkakaxi Jun 10, 2025
6fc343c
fix bugs in fused_experts_with_all2all
wanghanqingLYT Jun 11, 2025
b97e066
Adapt static EPLB
qmkakaxi Jun 11, 2025
474a5c3
add eplb tabel generator
Jun 11, 2025
807348f
add enable_eplb in ascend_config
qmkakaxi Jun 11, 2025
85d29c5
enable_eplb -> dynamic_eplb
qmkakaxi Jun 11, 2025
e4172aa
fix bugs in dynamioc eplb
wanghanqingLYT Jun 14, 2025
264a3a5
delete print in funsed_moe forward
wanghanqingLYT Jun 14, 2025
7334158
Merge pull request #52 from raindaywhu/dev_whq_eplb1
wanghanqingLYT Jun 14, 2025
a5dd04f
fix bugs caused by variable name old_placemet
wanghanqingLYT Jun 14, 2025
60c87b0
Merge pull request #53 from raindaywhu/dev_whq_eplb1
wanghanqingLYT Jun 14, 2025
241b722
move get_init_expert_map to forward_before
wanghanqingLYT Jun 14, 2025
b888701
Merge pull request #54 from raindaywhu/dev_whq_eplb1
wanghanqingLYT Jun 14, 2025
441508c
fix bug in log2phy in dynamic w8a8
wanghanqingLYT Jun 14, 2025
627757e
Merge pull request #55 from raindaywhu/dev_whq_eplb1
wanghanqingLYT Jun 14, 2025
88bb99d
fix bug for dim of updated_log2phy_map
wanghanqingLYT Jun 14, 2025
3913395
Merge pull request #56 from raindaywhu/dev_whq_eplb1
wanghanqingLYT Jun 14, 2025
dd55ce2
Merge remote-tracking branch 'origin/br_whq_eplb_main' into br_wjh_eplb
qmkakaxi Jun 16, 2025
e86964e
add dynamic_ep alg.
qmkakaxi Jun 16, 2025
9d1893a
fxi bugs
qmkakaxi Jun 16, 2025
230fd9c
Merge pull request #57 from raindaywhu/br_wjh_eplb
qmkakaxi Jun 16, 2025
0992bee
fix eplb update log
raindaywhu Jun 16, 2025
91ff797
Merge pull request #59 from raindaywhu/cy_eplb
raindaywhu Jun 16, 2025
0c6210f
fix bugsw
qmkakaxi Jun 16, 2025
3b7fd9b
Merge pull request #60 from raindaywhu/br_wjh_eplb
qmkakaxi Jun 16, 2025
9d63949
improve the implement of communication between main process and eplb …
wanghanqingLYT Jun 16, 2025
0269ef6
Merge pull request #61 from raindaywhu/dev_whq_eplb1
wanghanqingLYT Jun 16, 2025
ae01e08
add compose_expert_update_info_bipartite
qmkakaxi Jun 17, 2025
4b5cd84
adapt compose_expert_update_info_bipartite into eplb process
wanghanqingLYT Jun 17, 2025
74fe5ff
Merge branch 'br_whq_eplb_main' into br_wjh_eplb
wanghanqingLYT Jun 17, 2025
b7bfcc9
Merge pull request #62 from raindaywhu/br_wjh_eplb
wanghanqingLYT Jun 17, 2025
d5dc946
fix bugsw
qmkakaxi Jun 16, 2025
86be76f
improve the implement of communication between main process and eplb …
wanghanqingLYT Jun 16, 2025
03abde3
move generate log2ph map to eplb_worker
raindaywhu Jun 17, 2025
7f77443
fix bugsw
qmkakaxi Jun 16, 2025
0b8d00a
improve the implement of communication between main process and eplb …
wanghanqingLYT Jun 16, 2025
447360f
avoid frequetly synchronize between device and cpu when accessing to …
wanghanqingLYT Jun 17, 2025
acf2aee
Merge pull request #63 from raindaywhu/dev_whq_eplb1
wanghanqingLYT Jun 17, 2025
40ee72f
add gate for cacluate moe load
qmkakaxi Jun 17, 2025
baacad8
Merge pull request #64 from raindaywhu/br_wjh_eplb
qmkakaxi Jun 17, 2025
4380fdd
fix log2phy
raindaywhu Jun 17, 2025
556169d
Merge branch 'br_whq_eplb_main' into cy_eplb
raindaywhu Jun 17, 2025
347f60c
Merge branch 'br_whq_eplb_main' into cy_eplb
raindaywhu Jun 17, 2025
49efd9b
fix bugs in expert_map_per_layer_cpu
wanghanqingLYT Jun 17, 2025
352dbca
Merge pull request #66 from raindaywhu/dev_whq_eplb1
wanghanqingLYT Jun 17, 2025
d18aef1
fix log2phy
raindaywhu Jun 17, 2025
d6a76f8
Merge branch 'br_whq_eplb_main' into cy_eplb
raindaywhu Jun 17, 2025
53d0218
fix log2phy
Jun 17, 2025
af10b4a
mv log2phy into eplb worker
raindaywhu Jun 17, 2025
b39b6d2
Merge pull request #65 from raindaywhu/cy_eplb
raindaywhu Jun 17, 2025
1193c97
default 10 turns to wait worker finished
raindaywhu Jun 17, 2025
aa1660e
Merge pull request #67 from raindaywhu/cy_eplb
raindaywhu Jun 17, 2025
78b7480
fix bug in compose_expert_update_info_bipartite when adding node
wanghanqingLYT Jun 18, 2025
1d9b011
Merge pull request #68 from raindaywhu/dev_whq_eplb
wanghanqingLYT Jun 18, 2025
8e6b1ee
improve running time in generate_expert_d2d_transfer_task
wanghanqingLYT Jun 18, 2025
6d845f2
Merge pull request #69 from raindaywhu/dev_whq_eplb
wanghanqingLYT Jun 18, 2025
43def8a
add warm up & batch add
qmkakaxi Jun 18, 2025
130bbb9
Merge pull request #70 from raindaywhu/br_wjh_eplb
qmkakaxi Jun 18, 2025
9219cc8
delete layer moe load
qmkakaxi Jun 18, 2025
c600494
Merge pull request #71 from raindaywhu/br_wjh_eplb
qmkakaxi Jun 18, 2025
2125fe0
add get_tok_ids
qmkakaxi Jun 18, 2025
2403b59
Merge pull request #72 from raindaywhu/br_wjh_eplb
qmkakaxi Jun 18, 2025
4bda9ba
Extract cal_moe_load from deepseek_v2
qmkakaxi Jun 18, 2025
2e824cd
Merge pull request #73 from raindaywhu/br_wjh_eplb
qmkakaxi Jun 18, 2025
1b78fb2
running time reduction forward_before and forward_end
wanghanqingLYT Jun 19, 2025
53728f3
Merge pull request #74 from raindaywhu/dev_whq_eplb
wanghanqingLYT Jun 19, 2025
e4b1ba0
packed update info and put/get
qmkakaxi Jun 19, 2025
1c8edad
Merge pull request #75 from raindaywhu/br_wjh_eplb
qmkakaxi Jun 19, 2025
a9584dd
add get expert workload
Jun 19, 2025
6592d72
fix bug in pack update info
qmkakaxi Jun 19, 2025
e6a3851
Merge pull request #76 from raindaywhu/br_wjh_eplb
qmkakaxi Jun 19, 2025
17fc31e
improve implementation of generate_log2phy_map
wanghanqingLYT Jun 19, 2025
082e82d
Merge pull request #77 from raindaywhu/dev_whq_eplb
wanghanqingLYT Jun 19, 2025
926de75
Merge remote-tracking branch 'vllm_main/main' into br_main_into_eplb
qmkakaxi Jun 20, 2025
22de4ee
fix warm up & change init expert map from file
qmkakaxi Jun 20, 2025
e83f89d
add moe load in worker_v1
qmkakaxi Jun 20, 2025
3604f04
Merge remote-tracking branch 'vllm_main/main' into br_main_into_eplb
qmkakaxi Jun 20, 2025
2484055
Merge pull request #78 from raindaywhu/br_main_into_eplb_wjh
qmkakaxi Jun 20, 2025
1a21f30
fix warm up bugs
qmkakaxi Jun 20, 2025
38c1234
Merge pull request #79 from raindaywhu/br_main_into_eplb_wjh
qmkakaxi Jun 20, 2025
051f77a
fix log2phy bug
qmkakaxi Jun 20, 2025
7b6b474
Merge pull request #80 from raindaywhu/br_main_into_eplb_wjh
qmkakaxi Jun 20, 2025
d0c98c9
fix bugs: batch_isend_irecv synchronization and dtype bug in log2phy
wanghanqingLYT Jun 20, 2025
6226dee
Merge pull request #81 from raindaywhu/dev_whq_eplb1
wanghanqingLYT Jun 20, 2025
9295a9c
add another check for new placement generated by eplb algorithm
wanghanqingLYT Jun 21, 2025
67fa706
add dynamic_ep_v2
qmkakaxi Jun 21, 2025
2ccda78
Merge pull request #83 from raindaywhu/br_main_into_eplb_wjh
qmkakaxi Jun 21, 2025
7a11221
Merge pull request #82 from raindaywhu/dev_whq_eplb1
wanghanqingLYT Jun 21, 2025
771d4c7
fix dummy_run and profile_run
Jun 21, 2025
ff1076f
Merge pull request #84 from raindaywhu/cy_br_main_into_eplb
raindaywhu Jun 21, 2025
da27c2d
add mock experts_load data
Jun 21, 2025
70a922e
fix bugs in get_init_expert_map_from_file
wanghanqingLYT Jun 21, 2025
89f4376
Merge pull request #86 from raindaywhu/dev_whq_eplb1
wanghanqingLYT Jun 21, 2025
af31373
fix bug in init expert_map_per_layer_cpu
wanghanqingLYT Jun 21, 2025
5751c27
Merge pull request #87 from raindaywhu/dev_whq_eplb2
wanghanqingLYT Jun 21, 2025
613c030
add gate_eplb
qmkakaxi Jun 21, 2025
a2505fb
Merge pull request #88 from raindaywhu/br_main_into_eplb_wjh
qmkakaxi Jun 21, 2025
057a297
get_init_experts_map in warm up
qmkakaxi Jun 21, 2025
62108d7
Merge pull request #89 from raindaywhu/br_main_into_eplb_wjh
qmkakaxi Jun 21, 2025
9f498e9
add update_expert_load_statistical_period logic
Jun 21, 2025
11936d5
add generate expert map
qmkakaxi Jun 21, 2025
e83afa5
Merge remote-tracking branch 'origin/br_main_into_eplb' into br_main_…
qmkakaxi Jun 21, 2025
8907c9c
Merge branch 'br_main_into_eplb' into lt_dev
Jun 21, 2025
d0e8104
add generate_expert_map_all
qmkakaxi Jun 21, 2025
0c8318c
generate expert map
qmkakaxi Jun 21, 2025
ab4bfd2
init expert map
qmkakaxi Jun 21, 2025
adaed7b
Merge pull request #90 from raindaywhu/br_main_into_eplb_wjh
qmkakaxi Jun 21, 2025
e6e25f3
fix bugs in get_update_iteration
qmkakaxi Jun 21, 2025
0cfd62c
Merge pull request #91 from raindaywhu/br_main_into_eplb_wjh
qmkakaxi Jun 21, 2025
12f0c44
Merge branch 'br_main_into_eplb' into lt_dev
Jun 21, 2025
353150e
fix bug in get_init_expert_map_from_file
qmkakaxi Jun 21, 2025
43d4b87
Merge pull request #92 from raindaywhu/br_main_into_eplb_wjh
qmkakaxi Jun 21, 2025
f6830d4
update policy = 6
Jun 22, 2025
041e141
add load_gather_iteration
raindaywhu Jun 22, 2025
f4f9fd7
add code to guarantee there is no expert movement inside a NPU
wanghanqingLYT Jun 22, 2025
7371294
Merge pull request #93 from raindaywhu/dev_whq_eplb1
wanghanqingLYT Jun 22, 2025
5c09eab
新增日志 (add logging)
Jun 22, 2025
1e6b2c6
Merge branch 'lt_dev' of https://github.com/raindaywhu/vllm-ascend in…
Jun 22, 2025
017e0aa
Update policy_factory.py
wanghanqingLYT Jun 22, 2025
976eb9f
Merge pull request #94 from raindaywhu/dev_whq_eplb1
wanghanqingLYT Jun 22, 2025
d537fb2
update
Jun 22, 2025
1a8d238
Merge pull request #85 from raindaywhu/lt_dev
raindaywhu Jun 22, 2025
83f2d51
Merge branch 'br_main_into_eplb' of https://github.com/raindaywhu/vll…
Jun 22, 2025
9e2cca1
dummy run not add moe load
qmkakaxi Jun 22, 2025
5d1ce50
Merge pull request #95 from raindaywhu/br_main_into_eplb_wjh
qmkakaxi Jun 22, 2025
edb38e4
fix bug in compute moe load
qmkakaxi Jun 22, 2025
6bbdb15
Merge pull request #96 from raindaywhu/br_main_into_eplb_wjh
qmkakaxi Jun 22, 2025
8b31e79
fix bugs in forward_end
qmkakaxi Jun 22, 2025
5225f3c
Merge pull request #97 from raindaywhu/br_main_into_eplb_wjh
qmkakaxi Jun 22, 2025
2dba24d
Merge branch 'br_main_into_eplb' of https://github.com/raindaywhu/vll…
Jun 23, 2025
d4d0716
fix conflict
Jun 23, 2025
53e8949
fix some bug
Jun 23, 2025
98b9383
fix precision by fix a wrong branch condition in w8a8_dynamic.py
wanghanqingLYT Jun 23, 2025
a3544ce
Merge pull request #98 from raindaywhu/dev_whq_eplb3
wanghanqingLYT Jun 23, 2025
45766f6
fix code format alignment
Jun 23, 2025
6b36faf
update format
Jun 23, 2025
1a067a3
fix incident for function forward_end in eplb_updator.py
wanghanqingLYT Jun 23, 2025
fc88c4b
Merge pull request #100 from raindaywhu/dev_whq_eplb3
wanghanqingLYT Jun 23, 2025
9c329ed
optimize calculate moe load
qmkakaxi Jun 24, 2025
0897ccc
Merge pull request #101 from raindaywhu/br_main_into_eplb_wjh
qmkakaxi Jun 24, 2025
4980f2c
fix bug in moe load & add expert load to josn
qmkakaxi Jun 24, 2025
96fe998
Merge pull request #102 from raindaywhu/br_main_into_eplb_wjh
qmkakaxi Jun 24, 2025
da49def
merge from remote main
Jun 24, 2025
9d9c93a
update get_expert_load return type
Jun 24, 2025
162d106
fix bug when running benchmark by move forward_before behind return o…
wanghanqingLYT Jun 25, 2025
c57611c
Merge pull request #103 from raindaywhu/dev_whq_eplb1
wanghanqingLYT Jun 25, 2025
1f0b980
fix SwiftBalancer eplb algo
Jun 26, 2025
bfa07cf
Merge pull request #104 from raindaywhu/new_dev_main_cy
raindaywhu Jun 26, 2025
e7b7186
update get_expert_load logic
Jun 27, 2025
d018ec8
fix get_expert_load
qmkakaxi Jun 27, 2025
6a0a05e
delete invaild print
qmkakaxi Jun 27, 2025
1547810
delete empty tensor judgement
Jun 27, 2025
1b7b87b
Merge pull request #105 from raindaywhu/br_main_into_eplb_wjh
qmkakaxi Jun 27, 2025
969751a
merge from remote default branch and fix conflict
Jun 27, 2025
b0e68f7
merge default branch and fix conflict
Jun 27, 2025
3465ad6
relocate the code from the worker_runner to the server side.
Jun 28, 2025
0bab2cd
Merge pull request #99 from raindaywhu/lt_expert_load
raindaywhu Jun 28, 2025
ad5e7e1
collect moe load after dispatch
wanghanqingLYT Jun 30, 2025
e4cba5e
Merge branch 'br_main_into_eplb' into dev_whq_eplb2
wanghanqingLYT Jun 30, 2025
75992b9
Merge pull request #106 from raindaywhu/dev_whq_eplb2
wanghanqingLYT Jun 30, 2025
89bcf04
modify serialization of eplb process
wanghanqingLYT Jul 1, 2025
cfbe8b1
Merge pull request #107 from raindaywhu/dev_whq_eplb2
wanghanqingLYT Jul 2, 2025
2b62a47
improve d2d expert weight update impl in eplb_updator.py
wanghanqingLYT Jul 3, 2025
d79ace8
Merge pull request #108 from raindaywhu/dev_whq_eplb1
wanghanqingLYT Jul 4, 2025
9b32ca4
add function take_update_info_from_eplb_process
wanghanqingLYT Jul 7, 2025
0a5b075
Merge pull request #109 from raindaywhu/dev_whq_eplb1
wanghanqingLYT Jul 7, 2025
89247c5
update
Jul 8, 2025
3 changes: 0 additions & 3 deletions .github/dependabot.yml
@@ -2,9 +2,6 @@ version: 2
updates:
- package-ecosystem: "github-actions"
directory: "/"
schedule:
# Check for updates to GitHub Actions every week
interval: "weekly"
open-pull-requests-limit: 2
reviewers:
- "Yikun"
6 changes: 1 addition & 5 deletions .github/workflows/nightly_benchmarks.yaml
@@ -18,11 +18,7 @@
name: 'Benchmarks / Performance'
# This workflow runs nightly benchmarks for vllm-ascend.

on:
schedule:
# Run at 02:00 everyday
- cron: '00 18 * * *'

on:
workflow_dispatch:
# Allow manual triggering of the workflow

3 changes: 0 additions & 3 deletions .github/workflows/vllm_ascend_doctest.yaml
@@ -29,9 +29,6 @@ on:
- 'tests/e2e/doctests/**'
- 'tests/e2e/common.sh'
- 'tests/e2e/run_doctests.sh'
schedule:
# Runs every 4 hours
- cron: '0 */4 * * *'

# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
# declared as "shell: bash -el {0}" on steps that need to be properly activated.
251 changes: 68 additions & 183 deletions .github/workflows/vllm_ascend_test.yaml
@@ -18,8 +18,6 @@
name: 'test'

on:
schedule:
- cron: '0 23 * * *'
pull_request:
branches:
- 'main'
@@ -44,12 +42,6 @@ defaults:
run:
shell: bash -el {0}

# only cancel in-progress runs of the same workflow
# and ignore the lint / 1 card / 4 cards test type
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
lint:
runs-on: ubuntu-latest
@@ -114,171 +106,32 @@ jobs:
echo "::add-matcher::.github/workflows/matchers/mypy.json"
tools/mypy.sh 1 ${{ matrix.python-version }}

ut:
needs: [lint]
name: unit test
if: ${{ needs.lint.result == 'success' }}
runs-on: ubuntu-latest
container:
image: m.daocloud.io/quay.io/ascend/cann:8.1.rc1-910b-ubuntu22.04-py3.10
env:
VLLM_LOGGING_LEVEL: ERROR
VLLM_USE_MODELSCOPE: True
strategy:
matrix:
vllm_version: [main, v0.9.1]
steps:
- name: Install packages
run: |
apt-get update -y
apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev

- name: Checkout vllm-project/vllm repo
uses: actions/checkout@v4
with:
repository: vllm-project/vllm
ref: ${{ matrix.vllm_version }}
path: ./vllm-empty

- name: Install vllm-project/vllm from source
working-directory: ./vllm-empty
run: |
VLLM_TARGET_DEVICE=empty python3 -m pip install . --extra-index https://download.pytorch.org/whl/cpu/
python3 -m pip uninstall -y triton

- name: Checkout vllm-project/vllm-ascend repo
uses: actions/checkout@v4

- name: Install vllm-project/vllm-ascend
run: |
export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/x86_64-linux/devlib
python3 -m pip install -r requirements-dev.txt --extra-index https://download.pytorch.org/whl/cpu/
python3 -m pip install -v . --extra-index https://download.pytorch.org/whl/cpu/

- name: Run unit test for V1 Engine
env:
VLLM_USE_V1: 1
VLLM_WORKER_MULTIPROC_METHOD: spawn
TORCH_DEVICE_BACKEND_AUTOLOAD: 0
run: |
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/x86_64-linux/devlib
pytest -sv tests/ut

e2e:
needs: [lint]
if: ${{ needs.lint.result == 'success' }}
strategy:
max-parallel: 2
matrix:
os: [linux-arm64-npu-1]
vllm_version: [main, v0.9.1]
name: singlecard e2e test
runs-on: ${{ matrix.os }}
container:
# TODO(yikun): Remove m.daocloud.io prefix when infra proxy ready
image: m.daocloud.io/quay.io/ascend/cann:8.1.rc1-910b-ubuntu22.04-py3.10
env:
VLLM_LOGGING_LEVEL: ERROR
steps:
- name: Check npu and CANN info
run: |
npu-smi info
cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info

- name: Config mirrors
run: |
sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
apt-get update -y
apt install git -y
git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/

- name: Checkout vllm-project/vllm-ascend repo
uses: actions/checkout@v4

- name: Install system dependencies
run: |
apt-get -y install `cat packages.txt`
apt-get -y install gcc g++ cmake libnuma-dev

- name: Checkout vllm-project/vllm repo
uses: actions/checkout@v4
with:
repository: vllm-project/vllm
ref: ${{ matrix.vllm_version }}
path: ./vllm-empty

- name: Install vllm-project/vllm from source
working-directory: ./vllm-empty
run: |
VLLM_TARGET_DEVICE=empty pip install -e .

- name: Install vllm-project/vllm-ascend
env:
PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
run: |
pip install -r requirements-dev.txt
pip install -v -e .

- name: Run e2e test for V1 Engine
env:
VLLM_USE_V1: 1
VLLM_WORKER_MULTIPROC_METHOD: spawn
VLLM_USE_MODELSCOPE: True
run: |
pytest -sv tests/e2e/singlecard/test_offline_inference.py
# TODO: switch hf to modelscope
VLLM_USE_MODELSCOPE=False HF_ENDPOINT=https://hf-mirror.com \
pytest -sv tests/e2e/singlecard/test_ilama_lora.py
# TODO(sss): guided decoding doesn't work, fix it later
# pytest -sv tests/e2e/singlecard/test_guided_decoding.py
pytest -sv tests/e2e/singlecard/test_camem.py
pytest -sv tests/e2e/singlecard/ \
--ignore=tests/e2e/singlecard/test_offline_inference.py \
--ignore=tests/e2e/singlecard/test_ilama_lora.py \
--ignore=tests/e2e/singlecard/test_guided_decoding.py \
--ignore=tests/e2e/singlecard/test_camem.py

- name: Run e2e test on V0 engine
if: ${{ github.event_name == 'schedule' }}
env:
VLLM_USE_V1: 0
VLLM_USE_MODELSCOPE: True
run: |
pytest -sv tests/e2e/singlecard/test_offline_inference.py
# TODO: switch hf to modelscope
VLLM_USE_MODELSCOPE=False HF_ENDPOINT=https://hf-mirror.com \
pytest -sv tests/e2e/singlecard/test_ilama_lora.py
# guided decoding doesn't work, fix it later
# pytest -sv tests/e2e/singlecard/test_guided_decoding.py
pytest -sv tests/e2e/singlecard/test_camem.py
pytest -sv tests/e2e/singlecard/test_prompt_embedding.py
pytest -sv tests/e2e/singlecard/ \
--ignore=tests/e2e/singlecard/test_offline_inference.py \
--ignore=tests/e2e/singlecard/test_ilama_lora.py \
--ignore=tests/e2e/singlecard/test_guided_decoding.py \
--ignore=tests/e2e/singlecard/test_camem.py \
--ignore=tests/e2e/singlecard/test_prompt_embedding.py \
--ignore=tests/e2e/singlecard/core/test_ascend_scheduler.py \
--ignore=tests/e2e/singlecard/core/test_ascend_scheduler_e2e.py

e2e-4-cards:
needs: [e2e]
if: ${{ needs.e2e.result == 'success' }}
strategy:
max-parallel: 1
matrix:
os: [linux-arm64-npu-4]
vllm_version: [main, v0.9.1]
name: multicard e2e test
os: [linux-arm64-npu-1, linux-arm64-npu-4]
vllm_version: [v0.9.1]
concurrency:
group: >
${{
matrix.os == 'linux-arm64-npu-4'
&& github.event.pull_request.number
&& format('pr-{0}-limit-npu-4', github.event.pull_request.number)
|| format('job-{0}-{1}-{2}', matrix.os, matrix.vllm_version, github.event.pull_request.number)
}}
cancel-in-progress: false
name: vLLM Ascend test
runs-on: ${{ matrix.os }}
container:
# TODO(yikun): Remove m.daocloud.io prefix when infra proxy ready
image: m.daocloud.io/quay.io/ascend/cann:8.1.rc1-910b-ubuntu22.04-py3.10
env:
HF_ENDPOINT: https://hf-mirror.com
HF_TOKEN: ${{ secrets.HF_TOKEN }}
VLLM_LOGGING_LEVEL: ERROR
VLLM_USE_MODELSCOPE: True
steps:
- name: Check npu and CANN info
run: |
@@ -324,32 +177,64 @@ jobs:
env:
VLLM_USE_V1: 1
VLLM_WORKER_MULTIPROC_METHOD: spawn
VLLM_USE_MODELSCOPE: True
run: |
# TODO: switch hf to modelscope
VLLM_USE_MODELSCOPE=False HF_ENDPOINT=https://hf-mirror.com \
pytest -sv tests/e2e/multicard/test_ilama_lora_tp2.py
# Fixme: run VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py will raise error.
# To avoid oom, we need to run the test in a single process.
pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ
pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek
pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_topk
pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W8A8
pytest -sv tests/e2e/multicard/ --ignore=tests/e2e/multicard/test_ilama_lora_tp2.py --ignore=tests/e2e/multicard/test_offline_inference_distributed.py
if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then
VLLM_USE_MODELSCOPE=True pytest -sv tests/singlecard/test_offline_inference.py
# guided decoding doesn't work, fix it later
# pytest -sv tests/singlecard/test_guided_decoding.py.py
# test_ascend_config.py should be ran separately because it will regenerate the global config many times.
pytest -sv tests/singlecard/test_ascend_config.py
pytest -sv tests/singlecard/test_camem.py
pytest -sv tests/singlecard/core/test_ascend_scheduler.py
pytest -sv tests/singlecard/core/test_ascend_scheduler_e2e.py
pytest -sv tests/singlecard/ \
--ignore=tests/singlecard/test_offline_inference.py \
--ignore=tests/singlecard/test_guided_decoding.py \
--ignore=tests/singlecard/test_ascend_config.py \
--ignore=tests/singlecard/test_camem.py \
--ignore=tests/singlecard/core/test_ascend_scheduler.py \
--ignore=tests/singlecard/core/test_ascend_scheduler_e2e.py
else
pytest -sv tests/multicard/test_ilama_lora_tp2.py
# To avoid oom, we need to run the test in a single process.
VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_w4a8_deepseek.py::test_deepseek_W4A8
VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ
VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek
VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_topk
VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W8A8
VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_dbo
VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeekV3_dbo
VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/ --ignore=tests/multicard/test_ilama_lora_tp2.py --ignore=tests/multicard/test_offline_inference_distributed.py --ignore=tests/multicard/test_w4a8_deepseek.py
fi

- name: Run vllm-project/vllm-ascend test on V0 engine
if: ${{ github.event_name == 'schedule' }}
env:
VLLM_USE_V1: 0
VLLM_USE_MODELSCOPE: True
run: |
# TODO: switch hf to modelscope
VLLM_USE_MODELSCOPE=False HF_ENDPOINT=https://hf-mirror.com \
pytest -sv tests/e2e/multicard/test_ilama_lora_tp2.py
# Fixme: run VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py will raise error.
# To avoid oom, we need to run the test in a single process.
pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ
pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek
pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_topk
pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W8A8
pytest -sv tests/e2e/multicard/ --ignore=tests/e2e/multicard/test_ilama_lora_tp2.py --ignore=tests/e2e/multicard/test_offline_inference_distributed.py
if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then
VLLM_USE_MODELSCOPE=True pytest -sv tests/singlecard/test_offline_inference.py
# guided decoding doesn't work, fix it later
# pytest -sv tests/singlecard/test_guided_decoding.py.py
pytest -sv tests/singlecard/test_camem.py
# test_ascend_config.py should be ran separately because it will regenerate the global config many times.
pytest -sv tests/singlecard/test_ascend_config.py
pytest -sv tests/singlecard/test_prompt_embedding.py
pytest -sv tests/singlecard/ \
--ignore=tests/singlecard/test_offline_inference.py \
--ignore=tests/singlecard/test_guided_decoding.py \
--ignore=tests/singlecard/test_camem.py \
--ignore=tests/singlecard/test_ascend_config.py \
--ignore=tests/singlecard/test_prompt_embedding.py \
--ignore=tests/singlecard/core/test_ascend_scheduler.py \
--ignore=tests/singlecard/core/test_ascend_scheduler_e2e.py
else
pytest -sv tests/multicard/test_ilama_lora_tp2.py
# Fixme: run VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py will raise error.
# To avoid oom, we need to run the test in a single process.
VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ
VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek
VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_topk
VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W8A8
VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/ --ignore=tests/multicard/test_ilama_lora_tp2.py --ignore=tests/multicard/test_offline_inference_distributed.py
fi
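
For context, the hunk above folds the former single-card and 4-card e2e jobs into one matrix-driven job and serializes access to the scarce 4-NPU runner with a job-level concurrency group, while a bash branch on the matrix value picks the single-card or multi-card test set. The snippet below is a minimal, self-contained sketch of that pattern; the workflow name, job id, and test paths are illustrative assumptions, not the exact workflow in this PR.

```yaml
# Minimal sketch of the matrix + concurrency pattern shown in the diff above
# (workflow name, job id, and test paths are assumptions for illustration).
name: 'e2e sketch'
on: pull_request

jobs:
  e2e:
    strategy:
      matrix:
        os: [linux-arm64-npu-1, linux-arm64-npu-4]
    runs-on: ${{ matrix.os }}
    concurrency:
      # 4-NPU jobs for the same PR share one group and queue up;
      # everything else gets a unique group and can run in parallel.
      group: >-
        ${{
          matrix.os == 'linux-arm64-npu-4'
          && github.event.pull_request.number
          && format('pr-{0}-limit-npu-4', github.event.pull_request.number)
          || format('job-{0}-{1}', matrix.os, github.run_id)
        }}
      cancel-in-progress: false
    steps:
      - name: Run e2e tests
        run: |
          if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then
            pytest -sv tests/singlecard/   # single-card suite (illustrative path)
          else
            pytest -sv tests/multicard/    # multi-card suite (illustrative path)
          fi
```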
23 changes: 12 additions & 11 deletions .github/workflows/vllm_ascend_test_long_term.yaml
@@ -17,9 +17,6 @@
name: 'e2e test / long-term-test'

on:
schedule:
# Runs at 23:00 UTC (7:00 AM Beijing) every day
- cron: '0 23 * * *'
pull_request:
types: [ labeled ]

@@ -43,7 +40,7 @@ jobs:
max-parallel: 2
matrix:
os: [linux-arm64-npu-1, linux-arm64-npu-4]
vllm_version: [main, v0.9.1]
vllm_version: [v0.9.1]
name: vLLM Ascend long term test
runs-on: ${{ matrix.os }}
container:
@@ -97,13 +94,17 @@ jobs:
- name: Run vllm-project/vllm-ascend long term test
run: |
if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then
# spec decode test
VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/long_term/spec_decode/e2e/test_v1_mtp_correctness.py
# v0 spec decode test
# VLLM_USE_MODELSCOPE=True pytest -sv tests/long_term/spec_decode_v0/e2e/test_mtp_correctness.py # it needs a clean process
# pytest -sv tests/long_term/spec_decode_v0 --ignore=tests/long_term/spec_decode_v0/e2e/test_mtp_correctness.py
# v1 spec decode test
# TODO: revert me when test_v1_mtp_correctness.py is fixed
VLLM_USE_MODELSCOPE=True pytest -sv tests/long_term/spec_decode_v1/test_v1_mtp_correctness.py
# TODO: revert me when test_v1_spec_decode.py::test_ngram_correctness is fixed
# VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/long_term/spec_decode/e2e/test_v1_spec_decode.py
VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/long_term/spec_decode/e2e/test_mtp_correctness.py # it needs a clean process
pytest -sv tests/e2e/long_term/spec_decode --ignore=tests/e2e/long_term/spec_decode/e2e/test_mtp_correctness.py --ignore=tests/e2e/long_term/spec_decode/e2e/test_v1_spec_decode.py --ignore=tests/e2e/long_term/spec_decode/e2e/test_v1_mtp_correctness.py
pytest -sv tests/e2e/long_term/test_accuracy.py
# VLLM_USE_MODELSCOPE=True pytest -sv tests/long_term/spec_decode_v1/test_v1_spec_decode.py
# accuracy test single card
pytest -sv tests/long_term/test_accuracy.py
else
VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/long_term/test_deepseek_v2_lite_tp2_accuracy.py
# accuracy test multi card
VLLM_USE_MODELSCOPE=True pytest -sv tests/long_term/test_deepseek_v2_lite_tp2_accuracy.py
fi
9 changes: 5 additions & 4 deletions .github/workflows/vllm_ascend_test_pd.yaml
@@ -17,9 +17,6 @@
name: 'e2e test / pd-disaggregation'

on:
schedule:
# Runs at 23:00 UTC (7:00 AM Beijing) every day
- cron: '0 23 * * *'
pull_request:
types: [ labeled ]

@@ -41,7 +38,7 @@ jobs:
if: ${{ contains(github.event.pull_request.labels.*.name, 'pd-test') && contains(github.event.pull_request.labels.*.name, 'ready-for-test') || github.event_name == 'schedule' }}
strategy:
matrix:
vllm_verison: [main, v0.9.1]
vllm_verison: [v0.9.1]
name: vLLM Ascend prefilling decoding disaggregation test
runs-on: linux-arm64-npu-static-8

@@ -106,3 +103,7 @@ jobs:
- name: Run vllm-project/vllm-ascend PD Disaggregation test
run: |
pytest -sv tests/e2e/pd_disaggreate/test_pd_e2e.py

- name: Run vllm-project/vllm-ascend PD Disaggregation edge test
run: |
bash tests/e2e/pd_disaggreate/run_edge_case_test.sh