[Misc] Code clean up #1630

Open · wants to merge 2 commits into base: main
12 changes: 6 additions & 6 deletions docs/source/developer_guide/feature_guide/patch.md
@@ -20,11 +20,11 @@ In `vllm_ascend/patch`, you can see the code structure as follows:
vllm_ascend
├── patch
│ ├── platform
│ │ ├── patch_0_9_1
│ │ ├── patch_0_9_2
│ │ ├── patch_common
│ │ ├── patch_main
│ ├── worker
│ │ ├── patch_0_9_1
│ │ ├── patch_0_9_2
│ │ ├── patch_common
│ │ ├── patch_main
└───────────
@@ -38,15 +38,15 @@ vllm_ascend

In both the **platform** and **worker** folders, there are several patch modules. They are used for patching different versions of vLLM; a sketch of how these modules are typically selected follows the list below.

- `patch_0_9_1`: This module is used for patching vLLM 0.9.1. The version is always the nearest version of vLLM. Once vLLM is released, we will drop this patch module and bump to a new version. For example, `patch_0_9_2` is used for patching vLLM 0.9.2.
- `patch_0_9_2`: This module is used for patching vLLM 0.9.2. The version is always the latest released version of vLLM. Once a newer vLLM version is released, this module will be dropped and replaced by a new `patch_{version}` module for that release.
- `patch_main`: This module is used for patching the code in the vLLM main branch.
- `patch_common`: This module is used for patching both vLLM 0.9.1 and vLLM main branch.
- `patch_common`: This module is used for patching both vLLM 0.9.2 and the vLLM main branch.
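
To make the relationship between these modules concrete, below is a minimal sketch of how a platform `__init__.py` could select the right patch set by vLLM version. `vllm_version_is` is a real helper in `vllm_ascend.utils`; the module layout and selection logic shown here are assumptions for illustration, not the repository's actual code.

```python
# Hypothetical sketch of vllm_ascend/patch/platform/__init__.py.
# Only the selection pattern is illustrated; the real file may differ.
from vllm_ascend.utils import vllm_version_is

# patch_common applies to every supported vLLM version.
from vllm_ascend.patch.platform import patch_common  # noqa: F401

if vllm_version_is("0.9.2"):
    # Version-specific patches for the released vLLM 0.9.2.
    from vllm_ascend.patch.platform import patch_0_9_2  # noqa: F401
else:
    # Patches tracking the vLLM main branch.
    from vllm_ascend.patch.platform import patch_main  # noqa: F401
```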

## How to write a patch

Before writing a patch, following the principle above, we should patch as little code as possible. If necessary, we can patch the code in either the **platform** or the **worker** folder. Here is an example of patching the `distributed` module in vLLM.

1. Decide which version of vLLM we should patch. For example, after analysis, here we want to patch both 0.9.1 and main of vLLM.
1. Decide which version of vLLM we should patch. For example, after analysis, here we want to patch both vLLM 0.9.2 and the main branch.
2. Decide which process we should patch. For example, here `distributed` belongs to the vLLM main process, so we should patch `platform`.
3. Create the patch file in the right folder. The file should be named `patch_{module_name}.py`. The example here is `vllm_ascend/patch/platform/patch_common/patch_distributed.py`.
4. Write your patch code in the new file. Here is an example:
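
The example body itself is collapsed in this diff view. As a stand-in, here is a minimal sketch of the monkey-patch pattern such a file follows; the patched function (`destroy_model_parallel`) is chosen purely for illustration and is not necessarily what the real `patch_distributed.py` modifies.

```python
# Hypothetical sketch of vllm_ascend/patch/platform/patch_common/patch_distributed.py.
# The patched target below is illustrative only.
import vllm.distributed.parallel_state as parallel_state

# Keep a reference to the original implementation so it can still be called.
_original_destroy_model_parallel = parallel_state.destroy_model_parallel


def _patched_destroy_model_parallel() -> None:
    """Run the upstream logic, then perform Ascend-specific cleanup."""
    _original_destroy_model_parallel()
    # Ascend-specific cleanup would go here.


# Importing this module applies the patch.
parallel_state.destroy_model_parallel = _patched_destroy_model_parallel
```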
@@ -79,4 +79,4 @@ Before writing a patch, following the principle above, we should patch the least

## Limitation
1. In the V1 Engine, vLLM starts three kinds of processes: the main process, the EngineCore process, and the Worker process. Currently, vLLM Ascend only supports patching the code in the main process and the Worker process by default. If you want to patch code that runs in the EngineCore process, you should patch the EngineCore process entirely during setup; the entry code is in `vllm.v1.engine.core`. Please override `EngineCoreProc` and `DPEngineCoreProc` entirely (see the sketch after this list).
2. If you are running an edited vLLM code, the version of the vLLM may be changed automatically. For example, if you runs an edited vLLM based on v0.9.1, the version of vLLM may be change to v0.9.2xxx, in this case, the patch for v0.9.1 in vLLM Ascend would not work as expect, because that vLLM Ascend can't distinguish the version of vLLM you're using. In this case, you can set the environment variable `VLLM_VERSION` to specify the version of vLLM you're using, then the patch for v0.9.1 should work.
2. If you are running edited vLLM code, the detected vLLM version may change automatically. For example, if you run an edited vLLM based on v0.9.2, its version may become v0.9.2xxx; in this case, the patch for v0.9.2 in vLLM Ascend would not work as expected, because vLLM Ascend cannot distinguish which version of vLLM you are using. To fix this, set the environment variable `VLLM_VERSION` to specify the version of vLLM you are using, and the patch for v0.9.2 should work.
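
For the EngineCore limitation in item 1, the sketch below shows one way an entire-process override during setup might look. Only `EngineCoreProc` and `DPEngineCoreProc` (from `vllm.v1.engine.core`) are real names; the Ascend subclasses and the replacement mechanism are assumptions for illustration.

```python
# Hypothetical sketch: overriding the whole EngineCore process classes.
# Everything except EngineCoreProc/DPEngineCoreProc is an assumption.
import vllm.v1.engine.core as engine_core


class AscendEngineCoreProc(engine_core.EngineCoreProc):
    """Ascend-specific EngineCore process; override behaviour here as needed."""
    # Overridden methods would go here.


class AscendDPEngineCoreProc(engine_core.DPEngineCoreProc):
    """Ascend-specific data-parallel EngineCore process."""
    # Overridden methods would go here.


# Applied during setup, before any EngineCore process is spawned.
engine_core.EngineCoreProc = AscendEngineCoreProc
engine_core.DPEngineCoreProc = AscendDPEngineCoreProc
```

For item 2, setting `VLLM_VERSION` (for example, `VLLM_VERSION=0.9.2`) in the environment before launching tells vLLM Ascend which patch set to apply.
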
22 changes: 0 additions & 22 deletions tests/e2e/multicard/test_offline_inference_distributed.py
@@ -73,28 +73,6 @@ def test_models_distributed_DeepSeek_multistream_moe():
vllm_model.generate_greedy(example_prompts, max_tokens)


@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE": "1"})
def test_models_distributed_topk() -> None:
example_prompts = [
"vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.",
"Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.",
"Compare and contrast artificial intelligence with human intelligence in terms of processing information.",
]
dtype = "half"
sampling_params = SamplingParams(max_tokens=5,
temperature=0.0,
top_k=50,
top_p=0.9)

with VllmRunner(
"deepseek-ai/DeepSeek-V2-Lite",
dtype=dtype,
tensor_parallel_size=4,
distributed_executor_backend="mp",
) as vllm_model:
vllm_model.generate(example_prompts, sampling_params)


@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_DBO": "1"})
def test_models_distributed_DeepSeek_dbo():
example_prompts = ["The president of the United States is"] * 41
82 changes: 22 additions & 60 deletions tests/e2e/singlecard/core/ascend_scheduler/test_ascend_scheduler.py
@@ -16,7 +16,6 @@
from vllm.v1.structured_output import StructuredOutputManager

from vllm_ascend.core.scheduler import AscendScheduler
from vllm_ascend.utils import vllm_version_is

EOS_TOKEN_ID = 50256

@@ -140,9 +139,7 @@ def create_requests(num_requests: int,
multi_modal_placeholders=mm_position,
multi_modal_hashes=None,
eos_token_id=EOS_TOKEN_ID,
**({
"pooling_params": None
} if not vllm_version_is("0.9.1") else {}),
pooling_params={},
)
requests.append(request)
return requests
@@ -201,10 +198,7 @@ def test_schedule(enable_prefix_caching: Optional[bool],
# Test initial scheduling
output = scheduler.schedule()
assert len(output.scheduled_new_reqs) == len(requests)
if vllm_version_is("0.9.1"):
assert len(output.scheduled_cached_reqs) == 0
else:
assert output.scheduled_cached_reqs.num_reqs == 0
assert output.scheduled_cached_reqs.num_reqs == 0
assert len(output.finished_req_ids) == 0
# Verify all requests are scheduled.
for req_id, num_tokens in output.num_scheduled_tokens.items():
@@ -241,10 +235,7 @@ def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool):

output = scheduler.schedule()
assert len(output.scheduled_new_reqs) == 3
if vllm_version_is("0.9.1"):
assert len(output.scheduled_cached_reqs) == 0
else:
assert output.scheduled_cached_reqs.num_reqs == 0
assert output.scheduled_cached_reqs.num_reqs == 0
assert len(output.finished_req_ids) == 0

# The first request is scheduled partially - 400.
@@ -264,20 +255,15 @@ def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool):
spec_token_ids=None,
logprobs=None,
prompt_logprobs_dict={},
**({
"pooler_output": []
} if not vllm_version_is("0.9.1") else {}))
pooler_output=[])
scheduler.update_from_output(output, model_runner_output)

# Schedule the next step. All three requests are running.
# Processed the remaining prefills of the first and second requests.
output1 = scheduler.schedule()
assert len(scheduler.running) == 3
assert len(output1.scheduled_new_reqs) == 0
if vllm_version_is("0.9.1"):
assert len(output1.scheduled_cached_reqs) == 3
else:
assert output1.scheduled_cached_reqs.num_reqs == 3
assert output1.scheduled_cached_reqs.num_reqs == 3
assert len(output1.finished_req_ids) == 0
assert output1.num_scheduled_tokens[requests[0].request_id] == 400
assert output1.num_scheduled_tokens[requests[1].request_id] == 400
@@ -293,18 +279,13 @@ def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool):
spec_token_ids=None,
logprobs=None,
prompt_logprobs_dict={},
**({
"pooler_output": []
} if not vllm_version_is("0.9.1") else {}))
pooler_output=[])

scheduler.update_from_output(output1, model_runner_output)
output2 = scheduler.schedule()
assert len(scheduler.running) == 3
assert len(output2.scheduled_new_reqs) == 0
if vllm_version_is("0.9.1"):
assert len(output2.scheduled_cached_reqs) == 3
else:
assert output2.scheduled_cached_reqs.num_reqs == 3
assert output2.scheduled_cached_reqs.num_reqs == 3
assert len(output2.finished_req_ids) == 0
assert output2.num_scheduled_tokens[requests[0].request_id] == 1
assert output2.num_scheduled_tokens[requests[1].request_id] == 1
@@ -351,9 +332,7 @@ def test_stop_via_update_from_output():
spec_token_ids=None,
logprobs=None,
prompt_logprobs_dict={},
**({
"pooler_output": []
} if not vllm_version_is("0.9.1") else {}))
pooler_output=[])

scheduler.update_from_output(scheduler_output, model_output)

@@ -402,9 +381,7 @@ def test_stop_via_update_from_output():
spec_token_ids=None,
logprobs=None,
prompt_logprobs_dict={},
**({
"pooler_output": []
} if not vllm_version_is("0.9.1") else {}))
pooler_output=[])

scheduler.update_from_output(scheduler_output, model_output)

@@ -452,9 +429,7 @@ def test_stop_via_update_from_output():
spec_token_ids=None,
logprobs=None,
prompt_logprobs_dict={},
**({
"pooler_output": []
} if not vllm_version_is("0.9.1") else {}))
pooler_output=[])

scheduler.update_from_output(scheduler_output, model_output)

@@ -497,9 +472,7 @@ def test_stop_via_update_from_output():
spec_token_ids=None,
logprobs=None,
prompt_logprobs_dict={},
**({
"pooler_output": []
} if not vllm_version_is("0.9.1") else {}))
pooler_output=[])

scheduler.update_from_output(scheduler_output, model_output)

@@ -549,9 +522,7 @@ def test_schedule_concurrent_batches(enable_prefix_caching: Optional[bool],
spec_token_ids=None,
logprobs=None,
prompt_logprobs_dict={},
**({
"pooler_output": []
} if not vllm_version_is("0.9.1") else {}))
pooler_output=[])

scheduler.update_from_output(scheduler_output0, model_runner_output)

@@ -569,9 +540,7 @@ def test_schedule_concurrent_batches(enable_prefix_caching: Optional[bool],
spec_token_ids=None,
logprobs=None,
prompt_logprobs_dict={},
**({
"pooler_output": []
} if not vllm_version_is("0.9.1") else {}))
pooler_output=[])

scheduler.update_from_output(scheduler_output1, model_runner_output)

@@ -622,9 +591,7 @@ def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected):
spec_token_ids=spec_tokens,
logprobs=None,
prompt_logprobs_dict={},
**({
"pooler_output": []
} if not vllm_version_is("0.9.1") else {}))
pooler_output=[])

engine_core_outputs = scheduler.update_from_output(output,
model_runner_output)
@@ -657,16 +624,13 @@ def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected):
else:
assert req_id not in output.scheduled_spec_decode_tokens

model_runner_output = ModelRunnerOutput(
req_ids=req_ids,
req_id_to_index=req_to_index,
sampled_token_ids=output_tokens,
spec_token_ids=None,
logprobs=None,
prompt_logprobs_dict={},
**({
"pooler_output": []
} if not vllm_version_is("0.9.1") else {}))
model_runner_output = ModelRunnerOutput(req_ids=req_ids,
req_id_to_index=req_to_index,
sampled_token_ids=output_tokens,
spec_token_ids=None,
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[])

engine_core_outputs = scheduler.update_from_output(output,
model_runner_output)
@@ -695,9 +659,7 @@ def make_output(scheduler: AscendScheduler):
spec_token_ids=None,
logprobs=None,
prompt_logprobs_dict={},
**({
"pooler_output": []
} if not vllm_version_is("0.9.1") else {}))
pooler_output=[])


def assert_scheduler_empty(scheduler: AscendScheduler):
59 changes: 18 additions & 41 deletions tests/e2e/singlecard/sample/test_rejection_sampler.py
@@ -4,12 +4,12 @@
import pytest
import torch
import torch.nn.functional as F
from vllm.v1.sample.logits_processor import LogitsProcessorManager
from vllm.v1.sample.metadata import SamplingMetadata
from vllm.v1.spec_decode.metadata import SpecDecodeMetadata

from vllm_ascend.sample.rejection_sampler import (PLACEHOLDER_TOKEN_ID,
AscendRejectionSampler)
from vllm_ascend.utils import vllm_version_is

DEVICE = "npu"

@@ -50,46 +50,23 @@ def create_sampling_metadata(
temperature = None
else:
assert temperature is not None
if vllm_version_is("0.9.1"):
return SamplingMetadata(
temperature=temperature,
all_greedy=all_greedy,
all_random=not all_greedy,
top_p=top_p,
top_k=top_k,
min_p=torch.empty(1, ),
generators=generators,
max_num_logprobs=0,
no_penalties=False,
prompt_token_ids=None,
frequency_penalties=torch.tensor([]),
presence_penalties=torch.tensor([]),
repetition_penalties=torch.tensor([]),
output_token_ids=[],
min_tokens={},
logit_bias=[None],
allowed_token_ids_mask=None,
bad_words_token_ids={},
)
else:
from vllm.v1.sample.logits_processor import LogitsProcessorManager

return SamplingMetadata(temperature=temperature,
all_greedy=all_greedy,
all_random=not all_greedy,
top_p=top_p,
top_k=top_k,
generators=generators,
max_num_logprobs=0,
no_penalties=False,
prompt_token_ids=None,
frequency_penalties=torch.tensor([]),
presence_penalties=torch.tensor([]),
repetition_penalties=torch.tensor([]),
output_token_ids=[],
allowed_token_ids_mask=None,
bad_words_token_ids={},
logitsprocs=LogitsProcessorManager())

return SamplingMetadata(temperature=temperature,
all_greedy=all_greedy,
all_random=not all_greedy,
top_p=top_p,
top_k=top_k,
generators=generators,
max_num_logprobs=0,
no_penalties=False,
prompt_token_ids=None,
frequency_penalties=torch.tensor([]),
presence_penalties=torch.tensor([]),
repetition_penalties=torch.tensor([]),
output_token_ids=[],
allowed_token_ids_mask=None,
bad_words_token_ids={},
logitsprocs=LogitsProcessorManager())


########################### Tests for Greedy Sampling ###################
4 changes: 0 additions & 4 deletions tests/e2e/singlecard/test_embedding.py
@@ -19,12 +19,10 @@
from collections.abc import Sequence
from typing import Optional

import pytest
from modelscope import snapshot_download # type: ignore[import-untyped]

from tests.conftest import HfRunner
from tests.utils import check_embeddings_close, matryoshka_fy
from vllm_ascend.utils import vllm_version_is


def run_embedding_correctness_test(
@@ -51,8 +49,6 @@ def test_dummy():
assert True


@pytest.mark.skipif(vllm_version_is("0.9.1"),
reason="vLLM 0.9.1 does not support embed task for v1")
def test_embed_models_correctness(hf_runner, vllm_runner):
queries = ['What is the capital of China?', 'Explain gravity']
