
Commit 787010a

[Test] Remove VLLM_USE_V1 in example and tests (#1733)

V1 is enabled by default, so there is no need to set it by hand anymore. This PR removes the now-useless setting from the examples and tests.

- vLLM version: v0.9.2
- vLLM main: vllm-project/vllm@9ad0a45

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>

1 parent eb921d2 commit 787010a

29 files changed (+193 −298 lines)
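
Since V1 is now the default engine, the example scripts simply drop the opt-in toggle. A minimal sketch of the pattern (model name, prompt, and sampling settings are illustrative, not taken from this commit):

import os

from vllm import LLM, SamplingParams

# Previously the examples opted in to the V1 engine explicitly:
# os.environ["VLLM_USE_V1"] = "1"
# With V1 enabled by default, only the remaining settings are still needed, e.g.:
os.environ["VLLM_USE_MODELSCOPE"] = "True"

llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct")
outputs = llm.generate(["The president of the United States is"],
                       SamplingParams(max_tokens=32, temperature=0.0))
print(outputs[0].outputs[0].text)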

.github/workflows/vllm_ascend_test.yaml
Lines changed: 14 additions & 55 deletions

@@ -41,16 +41,10 @@ concurrency:
 
 jobs:
   lint:
-    # Only trigger lint on pull request
-    if: ${{ github.event_name == 'pull_request' }}
     uses: ./.github/workflows/pre-commit.yml
 
   changes:
-    # Only trigger changes on pull request
-    if: ${{ github.event_name == 'pull_request' }}
     runs-on: ubuntu-latest
-    permissions:
-      pull-requests: read
     outputs:
       e2e_tracker: ${{ steps.filter.outputs.e2e_tracker }}
       ut_tracker: ${{ steps.filter.outputs.ut_tracker }}
@@ -60,20 +54,24 @@ jobs:
         with:
           filters: |
             e2e_tracker:
+              - '.github/workflows/vllm_ascend_test.yaml'
               - 'vllm_ascend/**'
               - 'csrc/**'
               - 'cmake/**'
               - 'tests/e2e/**'
-              - 'tests/conftest.py'
-              - 'tests/model_utils.py'
-              - 'tests/utils.py'
+              - 'CMakeLists.txt'
+              - 'setup.py'
+              - 'requirements.txt'
+              - 'requirements-dev.txt'
+              - 'requirements-lint.txt'
+              - 'packages.txt'
             ut_tracker:
               - 'tests/ut/**'
   ut:
     needs: [lint, changes]
     name: unit test
-    # only trigger unit test after lint passed and the change is e2e and ut related. Or the PR is merged.
-    if: ${{ github.event_name == 'push' || (needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true')) }}
+    # only trigger unit test after lint passed and the change is e2e and ut related.
+    if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }}
     runs-on: ubuntu-latest
     container:
       image: quay.io/ascend/cann:8.1.rc1-910b-ubuntu22.04-py3.10
@@ -112,9 +110,8 @@ jobs:
          python3 -m pip install -r requirements-dev.txt --extra-index https://download.pytorch.org/whl/cpu/
          python3 -m pip install -v . --extra-index https://download.pytorch.org/whl/cpu/
 
-      - name: Run unit test for V1 Engine
+      - name: Run unit test
        env:
-          VLLM_USE_V1: 1
          VLLM_WORKER_MULTIPROC_METHOD: spawn
          TORCH_DEVICE_BACKEND_AUTOLOAD: 0
        run: |
@@ -133,8 +130,8 @@ jobs:
 
   e2e:
     needs: [lint, changes]
-    # only trigger e2e test after lint passed and the change is e2e related.
-    if: ${{ needs.lint.result == 'success' && needs.changes.outputs.e2e_tracker == 'true' }}
+    # only trigger e2e test after lint passed and the change is e2e related with pull request.
+    if: ${{ github.event_name == 'pull_request' && needs.lint.result == 'success' && needs.changes.outputs.e2e_tracker == 'true' }}
     strategy:
       max-parallel: 2
       matrix:
@@ -189,9 +186,8 @@ jobs:
          pip install -r requirements-dev.txt
          pip install -v -e .
 
-      - name: Run e2e test for V1 Engine
+      - name: Run e2e test
        env:
-          VLLM_USE_V1: 1
          VLLM_WORKER_MULTIPROC_METHOD: spawn
          VLLM_USE_MODELSCOPE: True
        run: |
@@ -213,26 +209,6 @@ jobs:
          # TODO: revert me when test_v1_spec_decode.py::test_ngram_correctness is fixed
          VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py
 
-      - name: Run e2e test on V0 engine
-        if: ${{ github.event_name == 'schedule' }}
-        env:
-          VLLM_USE_V1: 0
-          VLLM_USE_MODELSCOPE: True
-        run: |
-          pytest -sv tests/e2e/singlecard/test_offline_inference.py
-          pytest -sv tests/e2e/singlecard/test_ilama_lora.py
-          pytest -sv tests/e2e/singlecard/test_guided_decoding.py
-          pytest -sv tests/e2e/singlecard/test_camem.py
-          pytest -sv tests/e2e/singlecard/test_prompt_embedding.py
-          pytest -sv tests/e2e/singlecard/test_embedding.py
-          pytest -sv tests/e2e/singlecard/ \
-            --ignore=tests/e2e/singlecard/test_offline_inference.py \
-            --ignore=tests/e2e/singlecard/test_ilama_lora.py \
-            --ignore=tests/e2e/singlecard/test_guided_decoding.py \
-            --ignore=tests/e2e/singlecard/test_camem.py \
-            --ignore=tests/e2e/singlecard/test_prompt_embedding.py \
-            --ignore=tests/e2e/singlecard/test_embedding.py
-
   e2e-4-cards:
     needs: [e2e]
     if: ${{ needs.e2e.result == 'success' }}
@@ -290,9 +266,8 @@ jobs:
          pip install -r requirements-dev.txt
          pip install -v -e .
 
-      - name: Run vllm-project/vllm-ascend test for V1 Engine
+      - name: Run vllm-project/vllm-ascend test
        env:
-          VLLM_USE_V1: 1
          VLLM_WORKER_MULTIPROC_METHOD: spawn
          VLLM_USE_MODELSCOPE: True
        run: |
@@ -308,19 +283,3 @@ jobs:
          pytest -sv tests/e2e/multicard/ --ignore=tests/e2e/multicard/test_ilama_lora_tp2.py \
            --ignore=tests/e2e/multicard/test_offline_inference_distributed.py \
            --ignore=tests/e2e/multicard/test_data_parallel.py
-
-      - name: Run vllm-project/vllm-ascend test on V0 engine
-        if: ${{ github.event_name == 'schedule' }}
-        env:
-          VLLM_USE_V1: 0
-          VLLM_USE_MODELSCOPE: True
-        run: |
-          pytest -sv tests/e2e/multicard/test_ilama_lora_tp2.py
-          # Fixme: run VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py will raise error.
-          # To avoid oom, we need to run the test in a single process.
-          pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ
-          pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W8A8
-          pytest -sv tests/e2e/multicard/test_data_parallel.py
-          pytest -sv tests/e2e/multicard/ --ignore=tests/e2e/multicard/test_ilama_lora_tp2.py \
-            --ignore=tests/e2e/multicard/test_offline_inference_distributed.py \
-            --ignore=tests/e2e/multicard/test_data_parallel.py

examples/offline_data_parallel.py
Lines changed: 0 additions & 1 deletion

@@ -120,7 +120,6 @@ def main(
     trust_remote_code,
 ):
     # DP only support on V1 engine
-    os.environ["VLLM_USE_V1"] = "1"
     os.environ["VLLM_DP_RANK"] = str(global_dp_rank)
     os.environ["VLLM_DP_RANK_LOCAL"] = str(local_dp_rank)
     os.environ["VLLM_DP_SIZE"] = str(dp_size)

examples/offline_dualbatch_overlap_npu.py
Lines changed: 0 additions & 1 deletion

@@ -5,7 +5,6 @@
 
 # enable dual-batch overlap for vllm ascend
 os.environ["VLLM_ASCEND_ENABLE_DBO"] = "1"
-os.environ["VLLM_USE_V1"] = "1"
 
 # Sample prompts.
 prompts = ["The president of the United States is"] * 41

examples/offline_inference_sleep_mode_npu.py
Lines changed: 0 additions & 1 deletion

@@ -22,7 +22,6 @@
 from vllm import LLM, SamplingParams
 from vllm.utils import GiB_bytes
 
-os.environ["VLLM_USE_V1"] = "1"
 os.environ["VLLM_USE_MODELSCOPE"] = "True"
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
 
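
Not part of this diff, but for orientation: a rough sketch of what a sleep-mode example does after the setup above, assuming the LLM.sleep()/LLM.wake_up() API available in recent vLLM releases; the model name and prompt are placeholders.

from vllm import LLM, SamplingParams

# enable_sleep_mode lets the engine release most device memory on demand.
llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct", enable_sleep_mode=True)

prompt = "The capital of France is"
params = SamplingParams(temperature=0.0, max_tokens=16)
print(llm.generate([prompt], params)[0].outputs[0].text)

llm.sleep(level=1)  # free KV cache and offload weights while idle
llm.wake_up()       # restore the engine before generating again
print(llm.generate([prompt], params)[0].outputs[0].text)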

examples/run_dp_attention_etp16.sh
Lines changed: 0 additions & 1 deletion

@@ -1,4 +1,3 @@
-export VLLM_USE_V1=1
 export TASK_QUEUE_ENABLE=1
 source /usr/local/Ascend/ascend-toolkit/set_env.sh
 source /usr/local/Ascend/nnal/atb/set_env.sh

requirements-dev.txt
Lines changed: 1 addition & 0 deletions

@@ -12,4 +12,5 @@ xgrammar
 zmq
 types-psutil
 pytest-cov
+regex
 sentence_transformers

requirements-lint.txt
Lines changed: 1 addition & 0 deletions

@@ -4,5 +4,6 @@ pre-commit==4.0.1
 # type checking
 mypy==1.11.1
 types-PyYAML
+types-regex
 types-requests
 types-setuptools

tests/conftest.py renamed to tests/e2e/conftest.py
Lines changed: 3 additions & 3 deletions

@@ -39,8 +39,8 @@
 from vllm.transformers_utils.utils import maybe_model_redirect
 from vllm.utils import is_list_of
 
-from tests.model_utils import (PROMPT_TEMPLATES, TokensTextLogprobs,
-                               TokensTextLogprobsPromptLogprobs)
+from tests.e2e.model_utils import (PROMPT_TEMPLATES, TokensTextLogprobs,
+                                   TokensTextLogprobsPromptLogprobs)
 # TODO: remove this part after the patch merged into vllm, if
 # we not explicitly patch here, some of them might be effectiveless
 # in pytest scenario
@@ -62,7 +62,7 @@
 PromptVideoInput = _PromptMultiModalInput[np.ndarray]
 
 _TEST_DIR = os.path.dirname(__file__)
-_TEST_PROMPTS = [os.path.join(_TEST_DIR, "e2e", "prompts", "example.txt")]
+_TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")]
 
 
 def cleanup_dist_env_and_memory(shutdown_ray: bool = False):

File renamed without changes.
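
After the rename, test helpers live under the tests.e2e package, so imports change from tests.conftest / tests.model_utils to tests.e2e.conftest / tests.e2e.model_utils. A rough usage sketch (the model name and sampling settings are illustrative, not taken from this commit):

from vllm import SamplingParams

from tests.e2e.conftest import VllmRunner  # previously tests.conftest


def test_basic_generation():
    prompts = ["Hello, my name is"]
    sampling_params = SamplingParams(max_tokens=32, temperature=0.0)
    # VllmRunner wraps engine construction and teardown as a context manager.
    with VllmRunner("Qwen/Qwen2.5-0.5B-Instruct") as vllm_model:
        vllm_model.generate(prompts, sampling_params)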

tests/e2e/multicard/test_fused_moe_allgather_ep.py
Lines changed: 6 additions & 9 deletions

@@ -26,12 +26,11 @@
 from modelscope import snapshot_download  # type: ignore
 from vllm import SamplingParams
 
-from tests.conftest import VllmRunner
+from tests.e2e.conftest import VllmRunner
 
 
 @patch.dict(
     os.environ, {
-        "VLLM_USE_V1": "1",
         "VLLM_WORKER_MULTIPROC_METHOD": "spawn",
         "TASK_QUEUE_ENABLE": "1",
         "VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP": "1"
@@ -56,12 +55,10 @@ def test_generate_with_allgather():
         vllm_model.generate(example_prompts, sampling_params)
 
 
-@patch.dict(
-    os.environ, {
-        "VLLM_USE_V1": "1",
-        "VLLM_WORKER_MULTIPROC_METHOD": "spawn",
-        "TASK_QUEUE_ENABLE": "1"
-    })
+@patch.dict(os.environ, {
+    "VLLM_WORKER_MULTIPROC_METHOD": "spawn",
+    "TASK_QUEUE_ENABLE": "1"
+})
 def test_generate_with_alltoall():
     example_prompts = ["Hello, my name is"]
     sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
@@ -79,4 +76,4 @@ def test_generate_with_alltoall():
             },
             "expert_tensor_parallel_size": 1
         }) as vllm_model:
-        vllm_model.generate(example_prompts, sampling_params)
+        vllm_model.generate(example_prompts, sampling_params)
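
The decorator used above is from the standard library; a self-contained sketch of how unittest.mock.patch.dict temporarily overrides environment variables for one test (EXAMPLE_FLAG is a made-up variable used only to show the mechanism):

import os
from unittest.mock import patch


@patch.dict(os.environ, {"EXAMPLE_FLAG": "1"})
def test_env_is_patched():
    # The override is visible only while the decorated function runs.
    assert os.environ["EXAMPLE_FLAG"] == "1"


test_env_is_patched()
# Once the call returns, patch.dict restores the original environment.
assert "EXAMPLE_FLAG" not in os.environ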
