diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml
index 920818864e..027cefa483 100644
--- a/.github/workflows/vllm_ascend_test.yaml
+++ b/.github/workflows/vllm_ascend_test.yaml
@@ -196,7 +196,8 @@ jobs:
   e2e:
     needs: [lint]
     # only trigger e2e test on pull request after lint passed
-    if: ${{ needs.lint.result == 'success' && github.event_name == 'pull_request' }}
+    # if: ${{ needs.lint.result == 'success' && github.event_name == 'pull_request' }}
+    if: false
     strategy:
       max-parallel: 2
       matrix:
@@ -297,7 +298,8 @@ jobs:

   e2e-4-cards:
     needs: [e2e]
-    if: ${{ needs.e2e.result == 'success' }}
+    # if: ${{ needs.e2e.result == 'success' }}
+    if: false
     strategy:
       max-parallel: 1
       matrix:
diff --git a/tests/e2e/multicard/test_deepseek_v2_lite_tp2_accuracy.py b/tests/e2e/multicard/test_deepseek_v2_lite_tp2_accuracy.py
index 3a9068ff6b..3263f4c3b1 100644
--- a/tests/e2e/multicard/test_deepseek_v2_lite_tp2_accuracy.py
+++ b/tests/e2e/multicard/test_deepseek_v2_lite_tp2_accuracy.py
@@ -34,11 +34,11 @@
 # 3% relative tolerance for numerical accuracy.
 RTOL = 0.03
 # Baseline accuracy after VLLM optimization.
-EXPECTED_VALUE = 0.3843821076573162
+EXPECTED_VALUE = 0.6557998483699773


 def run_test(model_name, queue, more_args=None):
-    model_args = f"pretrained={model_name},max_model_len=4096,trust_remote_code=True,tensor_parallel_size=4,enforce_eager=True"
+    model_args = f"pretrained={model_name},max_model_len=4096,trust_remote_code=True,tensor_parallel_size=2,enforce_eager=True"
     if more_args is not None:
         model_args = f"{model_args},{more_args}"
     results = lm_eval.simple_evaluate(
diff --git a/tests/e2e/multicard/test_fused_moe_allgather_ep.py b/tests/e2e/multicard/test_fused_moe_allgather_ep.py
index ad755dd161..0747bc3e4d 100644
--- a/tests/e2e/multicard/test_fused_moe_allgather_ep.py
+++ b/tests/e2e/multicard/test_fused_moe_allgather_ep.py
@@ -36,12 +36,11 @@
         "TASK_QUEUE_ENABLE": "1",
         "VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP": "1"
     })
-def test_generate_with_allgather():
-    example_prompts = ["Hello, my name is"]
+def test_generate_with_allgather(example_prompts):
     sampling_params = SamplingParams(max_tokens=100, temperature=0.0)

     with VllmRunner(snapshot_download("vllm-ascend/DeepSeek-V3-Pruning"),
-                    tensor_parallel_size=4,
+                    tensor_parallel_size=2,
                     enforce_eager=True,
                     max_model_len=1024,
                     dtype="auto",
@@ -62,12 +61,11 @@ def test_generate_with_allgather():
         "VLLM_WORKER_MULTIPROC_METHOD": "spawn",
         "TASK_QUEUE_ENABLE": "1"
     })
-def test_generate_with_alltoall():
-    example_prompts = ["Hello, my name is"]
+def test_generate_with_alltoall(example_prompts):
     sampling_params = SamplingParams(max_tokens=100, temperature=0.0)

     with VllmRunner(snapshot_download("vllm-ascend/DeepSeek-V3-Pruning"),
-                    tensor_parallel_size=4,
+                    tensor_parallel_size=2,
                     enforce_eager=True,
                     max_model_len=1024,
                     dtype="auto",
diff --git a/tests/e2e/multicard/test_offline_inference_distributed.py b/tests/e2e/multicard/test_offline_inference_distributed.py
index 47ff47eddd..8fa5ae7db1 100644
--- a/tests/e2e/multicard/test_offline_inference_distributed.py
+++ b/tests/e2e/multicard/test_offline_inference_distributed.py
@@ -23,7 +23,7 @@
 import os
 from unittest.mock import patch

-from modelscope import snapshot_download  # type: ignore
+import pytest
 from vllm import SamplingParams
 from vllm.model_executor.models.registry import ModelRegistry

@@ -32,98 +32,27 @@
 os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"


-def test_models_distributed_QwQ():
-    example_prompts = [
-        "Hello, my name is",
-    ]
+@pytest.mark.parametrize("distributed_executor_backend", ["mp", "ray"])
+def test_models_distributed_QwQ(example_prompts, distributed_executor_backend):
     dtype = "half"
     max_tokens = 5
     with VllmRunner(
             "Qwen/QwQ-32B",
             dtype=dtype,
-            tensor_parallel_size=4,
-            distributed_executor_backend="mp",
+            tensor_parallel_size=2,
+            distributed_executor_backend=distributed_executor_backend,
     ) as vllm_model:
         vllm_model.generate_greedy(example_prompts, max_tokens)


-def test_models_distributed_DeepSeek_multistream_moe():
-    example_prompts = [
-        "Hello, my name is",
-    ]
-    dtype = "half"
-    max_tokens = 5
-    with VllmRunner(
-            "vllm-ascend/DeepSeek-V3-Pruning",
-            dtype=dtype,
-            tensor_parallel_size=4,
-            distributed_executor_backend="mp",
-            additional_config={
-                "torchair_graph_config": {
-                    "enabled": True,
-                    "enable_multistream_moe": True,
-                },
-                "ascend_scheduler_config": {
-                    "enabled": True,
-                },
-                "refresh": True,
-            },
-            enforce_eager=False,
-    ) as vllm_model:
-        vllm_model.generate_greedy(example_prompts, max_tokens)
-
-
-@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE": "1"})
-def test_models_distributed_topk() -> None:
-    example_prompts = [
-        "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.",
-        "Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.",
-        "Compare and contrast artificial intelligence with human intelligence in terms of processing information.",
-    ]
-    dtype = "half"
-    sampling_params = SamplingParams(max_tokens=5,
-                                     temperature=0.0,
-                                     top_k=50,
-                                     top_p=0.9)
-
-    with VllmRunner(
-            "deepseek-ai/DeepSeek-V2-Lite",
-            dtype=dtype,
-            tensor_parallel_size=4,
-            distributed_executor_backend="mp",
-    ) as vllm_model:
-        vllm_model.generate(example_prompts, sampling_params)
-
-
-@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_DBO": "1"})
-def test_models_distributed_DeepSeek_dbo():
-    example_prompts = ["The president of the United States is"] * 41
-    dtype = "half"
-    sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
-    with VllmRunner(
-            "deepseek-ai/DeepSeek-V2-Lite",
-            dtype=dtype,
-            tensor_parallel_size=4,
-            distributed_executor_backend="mp",
-    ) as vllm_model:
-        model_arch = 'DeepseekV2ForCausalLM'
-        registed_models = ModelRegistry.models
-        assert registed_models[
-            model_arch].module_name == "vllm_ascend.models.deepseek_dbo"
-        assert registed_models[
-            model_arch].class_name == "CustomDeepseekDBOForCausalLM"
-        vllm_model.generate(example_prompts, sampling_params)
-
-
 @patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_DBO": "1"})
-def test_models_distributed_DeepSeekV3_dbo():
-    example_prompts = ["The president of the United States is"] * 41
+def test_models_distributed_DeepSeekV3_dbo(example_prompts):
     dtype = "half"
     sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
     with VllmRunner(
             "vllm-ascend/DeepSeek-V3-Pruning",
             dtype=dtype,
-            tensor_parallel_size=4,
+            tensor_parallel_size=2,
             distributed_executor_backend="mp",
     ) as vllm_model:
         model_arch = 'DeepseekV3ForCausalLM'
@@ -133,37 +62,3 @@ def test_models_distributed_DeepSeekV3_dbo():
         assert registed_models[
             model_arch].class_name == "CustomDeepseekDBOForCausalLM"
         vllm_model.generate(example_prompts, sampling_params)
-
-
-def test_models_distributed_DeepSeek_W8A8():
-    example_prompts = [
-        "Hello, my name is",
-    ]
-    max_tokens = 5
-
-    with VllmRunner(
-            snapshot_download("vllm-ascend/DeepSeek-V2-Lite-W8A8"),
-            max_model_len=8192,
-            enforce_eager=True,
-            dtype="auto",
-            tensor_parallel_size=4,
-            quantization="ascend",
-    ) as vllm_model:
-        vllm_model.generate_greedy(example_prompts, max_tokens)
-
-
-def test_models_distributed_pangu():
-    example_prompts = [
-        "Hello, my name is",
-    ]
-    max_tokens = 5
-
-    with VllmRunner(
-            snapshot_download("vllm-ascend/pangu-pro-moe-pruing"),
-            max_model_len=8192,
-            enforce_eager=True,
-            dtype="auto",
-            tensor_parallel_size=4,
-            distributed_executor_backend="mp",
-    ) as vllm_model:
-        vllm_model.generate_greedy(example_prompts, max_tokens)
diff --git a/tests/e2e/multicard/test_prefix_caching.py b/tests/e2e/singlecard/core/test_prefix_caching.py
similarity index 97%
rename from tests/e2e/multicard/test_prefix_caching.py
rename to tests/e2e/singlecard/core/test_prefix_caching.py
index 368d3ff953..deb4e4b04c 100644
--- a/tests/e2e/multicard/test_prefix_caching.py
+++ b/tests/e2e/singlecard/core/test_prefix_caching.py
@@ -68,7 +68,6 @@ def test_prefix_cache_with_v1_scheduler(model: str, max_tokens: int) -> None:
     with VllmRunner(model,
                     enforce_eager=True,
                     max_model_len=2048,
-                    tensor_parallel_size=2,
                     gpu_memory_utilization=0.7) as vllm_model:
         prefix_cache_output = vllm_model.generate_greedy(
             INPUT_PROMPTS, max_tokens)
@@ -77,7 +76,6 @@ def test_prefix_cache_with_v1_scheduler(model: str, max_tokens: int) -> None:
                     enable_prefix_caching=False,
                     enforce_eager=True,
                     max_model_len=2048,
-                    tensor_parallel_size=2,
                     gpu_memory_utilization=0.7) as vllm_model:
         vllm_output = vllm_model.generate_greedy(INPUT_PROMPTS, max_tokens)

@@ -104,7 +102,6 @@ def test_prefix_cache_with_ascend_scheduler(model: str,
                     },
                     enforce_eager=True,
                     max_model_len=2048,
-                    tensor_parallel_size=2,
                     gpu_memory_utilization=0.7) as vllm_model:
         vllm_output = vllm_model.generate_greedy(INPUT_PROMPTS, max_tokens)

@@ -117,7 +114,6 @@ def test_prefix_cache_with_ascend_scheduler(model: str,
                     },
                     enforce_eager=True,
                     max_model_len=2048,
-                    tensor_parallel_size=2,
                     gpu_memory_utilization=0.7) as vllm_model:
         prefix_cache_output = vllm_model.generate_greedy(
             INPUT_PROMPTS, max_tokens)
@@ -132,7 +128,6 @@ def test_prefix_cache_with_ascend_scheduler(model: str,
                     },
                     enforce_eager=True,
                     max_model_len=2048,
-                    tensor_parallel_size=2,
                     gpu_memory_utilization=0.7) as vllm_model:
         chunk_prefill_prefix_cache_output = vllm_model.generate_greedy(
             INPUT_PROMPTS, max_tokens)
diff --git a/tests/e2e/singlecard/quant/test_w8a8.py b/tests/e2e/singlecard/quant/test_w8a8.py
new file mode 100644
index 0000000000..94b4d1169f
--- /dev/null
+++ b/tests/e2e/singlecard/quant/test_w8a8.py
@@ -0,0 +1,41 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# Copyright 2023 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project.
+#
+
+import pytest
+
+from tests.conftest import VllmRunner
+
+MODELS = [
+    "vllm-ascend/DeepSeek-V2-Lite-W8A8",
+    "vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8"
+]
+
+
+@pytest.mark.parametrize("model", MODELS)
+def test_quant_W8A8(example_prompts, model):
+    max_tokens = 5
+
+    with VllmRunner(
+            model,
+            max_model_len=8192,
+            enforce_eager=True,
+            dtype="auto",
+            gpu_memory_utilization=0.7,
+            quantization="ascend",
+    ) as vllm_model:
+        vllm_model.generate_greedy(example_prompts, max_tokens)
diff --git a/tests/e2e/singlecard/sample/test_e2e_with_topk.py b/tests/e2e/singlecard/sample/test_e2e_with_topk.py
new file mode 100644
index 0000000000..230459fe43
--- /dev/null
+++ b/tests/e2e/singlecard/sample/test_e2e_with_topk.py
@@ -0,0 +1,49 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# Copyright 2023 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project.
+#
+
+import os
+from unittest.mock import patch
+
+import pytest
+from vllm import SamplingParams
+
+from tests.conftest import VllmRunner
+
+MODELS = ["deepseek-ai/DeepSeek-V2-Lite", "Qwen/Qwen2.5-0.5B-Instruct"]
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["half", "float16"])
+@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE": "1"})
+def test_models_distributed_topk(model, dtype) -> None:
+    example_prompts = [
+        "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.",
+        "Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.",
+        "Compare and contrast artificial intelligence with human intelligence in terms of processing information.",
+    ]
+    sampling_params = SamplingParams(max_tokens=5,
+                                     temperature=0.0,
+                                     top_k=50,
+                                     top_p=0.9)
+
+    with VllmRunner(
+            model,
+            dtype=dtype,
+            gpu_memory_utilization=0.7,
+    ) as vllm_model:
+        vllm_model.generate(example_prompts, sampling_params)
diff --git a/tests/e2e/singlecard/test_sampler.py b/tests/e2e/singlecard/sample/test_sampler.py
similarity index 100%
rename from tests/e2e/singlecard/test_sampler.py
rename to tests/e2e/singlecard/sample/test_sampler.py
diff --git a/tests/e2e/singlecard/test_chunked.py b/tests/e2e/singlecard/test_chunked_mla.py
similarity index 92%
rename from tests/e2e/singlecard/test_chunked.py
rename to tests/e2e/singlecard/test_chunked_mla.py
index 2240b88e2c..4eb351bd61 100644
--- a/tests/e2e/singlecard/test_chunked.py
+++ b/tests/e2e/singlecard/test_chunked_mla.py
@@ -34,13 +34,13 @@
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("max_tokens", [1])
 def test_models(
+    example_prompts,
     model: str,
     max_tokens: int,
     monkeypatch: pytest.MonkeyPatch,
 ) -> None:
     return
     with monkeypatch.context() as m:
-        prompts = "The president of the United States is"

         m.setenv("VLLM_USE_V1", "1")

@@ -52,7 +52,7 @@ def test_models(
         vllm_model = LLM(model,
                          long_prefill_token_threshold=4,
                          enforce_eager=True)
-        output_chunked = vllm_model.generate(prompts, sampling_params)
+        output_chunked = vllm_model.generate(example_prompts, sampling_params)
         logprobs_chunked = output_chunked.outputs[0].logprobs
         del vllm_model
         torch.npu.empty_cache()
@@ -64,7 +64,7 @@ def test_models(
                               'enabled': True
                           },
                       })
-        output = vllm_model.generate(prompts, sampling_params)
+        output = vllm_model.generate(example_prompts, sampling_params)
         logprobs = output.outputs[0].logprobs
         del vllm_model
         torch.npu.empty_cache()
diff --git a/tests/e2e/singlecard/test_deepseek_multistream.py b/tests/e2e/singlecard/test_deepseek_multistream.py
new file mode 100644
index 0000000000..3872221cc6
--- /dev/null
+++ b/tests/e2e/singlecard/test_deepseek_multistream.py
@@ -0,0 +1,40 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# Copyright 2023 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project.
+#
+
+from tests.conftest import VllmRunner
+
+
+def test_models_distributed_DeepSeek_multistream_moe(example_prompts):
+    dtype = "half"
+    max_tokens = 5
+    with VllmRunner(
+            "vllm-ascend/DeepSeek-V3-Pruning",
+            dtype=dtype,
+            additional_config={
+                "torchair_graph_config": {
+                    "enabled": True,
+                    "enable_multistream_moe": True,
+                },
+                "ascend_scheduler_config": {
+                    "enabled": True,
+                },
+                "refresh": True,
+            },
+            enforce_eager=False,
+    ) as vllm_model:
+        vllm_model.generate_greedy(example_prompts, max_tokens)
diff --git a/tests/e2e/singlecard/test_offline_inference.py b/tests/e2e/singlecard/test_offline_inference.py
index de69612279..4cc8f50718 100644
--- a/tests/e2e/singlecard/test_offline_inference.py
+++ b/tests/e2e/singlecard/test_offline_inference.py
@@ -21,26 +21,18 @@
 Run `pytest tests/test_offline_inference.py`.
 """
 import os
-from unittest.mock import patch

 import pytest
-import vllm  # noqa: F401
-from modelscope import snapshot_download  # type: ignore[import-untyped]
-from vllm import SamplingParams
 from vllm.assets.image import ImageAsset

-import vllm_ascend  # noqa: F401
 from tests.conftest import VllmRunner

 MODELS = [
-    "Qwen/Qwen2.5-0.5B-Instruct",
-    "Qwen/Qwen3-0.6B-Base",
+    "Qwen/Qwen2.5-0.5B-Instruct", "Qwen/Qwen3-0.6B-Base",
+    "vllm-ascend/pangu-pro-moe-pruing"
 ]
 MULTIMODALITY_MODELS = ["Qwen/Qwen2.5-VL-3B-Instruct"]

-QUANTIZATION_MODELS = [
-    "vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8",
-]
 os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"


@@ -63,27 +55,6 @@ def test_models(model: str, dtype: str, max_tokens: int) -> None:
         vllm_model.generate_greedy(example_prompts, max_tokens)


-@pytest.mark.parametrize("model", QUANTIZATION_MODELS)
-@pytest.mark.parametrize("max_tokens", [5])
-def test_quantization_models(model: str, max_tokens: int) -> None:
-    prompt = "The following numbers of the sequence " + ", ".join(
-        str(i) for i in range(1024)) + " are:"
-    example_prompts = [prompt]
-
-    # NOTE: Using quantized model repo id from modelscope encounters an issue,
-    # this pr (https://github.com/vllm-project/vllm/pull/19212) fix the issue,
-    # after it is being merged, there's no need to download model explicitly.
-    model_path = snapshot_download(model)
-
-    with VllmRunner(model_path,
-                    max_model_len=8192,
-                    enforce_eager=True,
-                    dtype="auto",
-                    gpu_memory_utilization=0.7,
-                    quantization="ascend") as vllm_model:
-        vllm_model.generate_greedy(example_prompts, max_tokens)
-
-
 @pytest.mark.parametrize("model", MULTIMODALITY_MODELS)
 def test_multimodal(model, prompt_template, vllm_runner):
     image = ImageAsset("cherry_blossom") \
         .pil_image.convert("RGB")
@@ -106,24 +77,3 @@ def test_multimodal(model, prompt_template, vllm_runner):
         vllm_model.generate_greedy(prompts=prompts,
                                    images=images,
                                    max_tokens=64)
-
-
-@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE": "1"})
-def test_models_topk() -> None:
-    example_prompts = [
-        "Hello, my name is",
-        "The president of the United States is",
-        "The capital of France is",
-        "The future of AI is",
-    ]
-    sampling_params = SamplingParams(max_tokens=5,
-                                     temperature=0.0,
-                                     top_k=50,
-                                     top_p=0.9)
-
-    with VllmRunner("Qwen/Qwen2.5-0.5B-Instruct",
-                    max_model_len=8192,
-                    dtype="float16",
-                    enforce_eager=True,
-                    gpu_memory_utilization=0.7) as vllm_model:
-        vllm_model.generate(example_prompts, sampling_params)
diff --git a/tests/e2e/multicard/test_torchair_graph_mode.py b/tests/e2e/singlecard/test_torchair_graph_mode.py
similarity index 94%
rename from tests/e2e/multicard/test_torchair_graph_mode.py
rename to tests/e2e/singlecard/test_torchair_graph_mode.py
index ce628f9d35..d17a4e695d 100644
--- a/tests/e2e/multicard/test_torchair_graph_mode.py
+++ b/tests/e2e/singlecard/test_torchair_graph_mode.py
@@ -31,8 +31,6 @@

 def _deepseek_torchair_test_fixture(
     additional_config: Dict,
-    *,
-    tensor_parallel_size=4,
 ):
     example_prompts = [
         "Hello, my name is",
@@ -53,8 +51,6 @@ def _deepseek_torchair_test_fixture(
     with VllmRunner(
             "vllm-ascend/DeepSeek-V3-Pruning",
             dtype="half",
-            tensor_parallel_size=tensor_parallel_size,
-            distributed_executor_backend="mp",
             enforce_eager=False,
             additional_config=additional_config,
     ) as vllm_model:
@@ -103,8 +99,6 @@ def test_e2e_deepseekv3_with_torchair_ms_mla():

 def _pangu_torchair_test_fixture(
     additional_config: Dict,
-    *,
-    tensor_parallel_size=4,
 ):
     example_prompts = [
         "Hello, my name is",
@@ -125,8 +119,6 @@ def _pangu_torchair_test_fixture(
     with VllmRunner(
             "vllm-ascend/pangu-pro-moe-pruing",
             dtype="half",
-            tensor_parallel_size=tensor_parallel_size,
-            distributed_executor_backend="mp",
             enforce_eager=False,
             additional_config=additional_config,
     ) as vllm_model:
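
Note: many of the tests touched above stop building prompt lists inline and instead take `example_prompts` as a pytest fixture, which is expected to come from `tests/conftest.py` (not shown in this diff). A minimal sketch of what such a fixture might look like, assuming the prompts mirror the inline lists this diff removes (illustrative only; the real fixture may differ):

# Hypothetical sketch of the example_prompts fixture assumed by the tests
# above; the actual definition lives in tests/conftest.py and may differ.
import pytest


@pytest.fixture
def example_prompts():
    # Prompts taken from the inline lists that this diff removes.
    return [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]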