[CI/UT][Refactor] Refactor multi-card CI #1645

Draft · wants to merge 2 commits into base: main
6 changes: 4 additions & 2 deletions .github/workflows/vllm_ascend_test.yaml
@@ -196,7 +196,8 @@ jobs:
e2e:
needs: [lint]
# only trigger e2e test on pull request after lint passed
if: ${{ needs.lint.result == 'success' && github.event_name == 'pull_request' }}
# if: ${{ needs.lint.result == 'success' && github.event_name == 'pull_request' }}
if: false
strategy:
max-parallel: 2
matrix:
@@ -297,7 +298,8 @@ jobs:

e2e-4-cards:
needs: [e2e]
if: ${{ needs.e2e.result == 'success' }}
# if: ${{ needs.e2e.result == 'success' }}
if: false
strategy:
max-parallel: 1
matrix:
4 changes: 2 additions & 2 deletions tests/e2e/multicard/test_deepseek_v2_lite_tp2_accuracy.py
@@ -34,11 +34,11 @@
# 3% relative tolerance for numerical accuracy.
RTOL = 0.03
# Baseline accuracy after VLLM optimization.
EXPECTED_VALUE = 0.3843821076573162
EXPECTED_VALUE = 0.6557998483699773


def run_test(model_name, queue, more_args=None):
model_args = f"pretrained={model_name},max_model_len=4096,trust_remote_code=True,tensor_parallel_size=4,enforce_eager=True"
model_args = f"pretrained={model_name},max_model_len=4096,trust_remote_code=True,tensor_parallel_size=2,enforce_eager=True"
if more_args is not None:
model_args = f"{model_args},{more_args}"
results = lm_eval.simple_evaluate(
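
The baseline above is consumed together with RTOL; the assertion itself sits outside the lines shown in this diff. A minimal sketch of the kind of tolerance check implied here, with the helper name and result plumbing being assumptions rather than code from this PR:

def check_accuracy(measured_value: float,
                   expected_value: float = EXPECTED_VALUE,
                   rtol: float = RTOL) -> None:
    # Hypothetical helper, not part of this diff: accept any measured
    # gsm8k accuracy within +/- RTOL of the recorded baseline.
    lower, upper = expected_value - rtol, expected_value + rtol
    assert lower <= measured_value <= upper, (
        f"accuracy {measured_value:.4f} outside [{lower:.4f}, {upper:.4f}]")
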
10 changes: 4 additions & 6 deletions tests/e2e/multicard/test_fused_moe_allgather_ep.py
@@ -36,12 +36,11 @@
"TASK_QUEUE_ENABLE": "1",
"VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP": "1"
})
def test_generate_with_allgather():
example_prompts = ["Hello, my name is"]
def test_generate_with_allgather(example_prompts):
sampling_params = SamplingParams(max_tokens=100, temperature=0.0)

with VllmRunner(snapshot_download("vllm-ascend/DeepSeek-V3-Pruning"),
tensor_parallel_size=4,
tensor_parallel_size=2,
enforce_eager=True,
max_model_len=1024,
dtype="auto",
@@ -62,12 +61,11 @@ def test_generate_with_allgather():
"VLLM_WORKER_MULTIPROC_METHOD": "spawn",
"TASK_QUEUE_ENABLE": "1"
})
def test_generate_with_alltoall():
example_prompts = ["Hello, my name is"]
def test_generate_with_alltoall(example_prompts):
sampling_params = SamplingParams(max_tokens=100, temperature=0.0)

with VllmRunner(snapshot_download("vllm-ascend/DeepSeek-V3-Pruning"),
tensor_parallel_size=4,
tensor_parallel_size=2,
enforce_eager=True,
max_model_len=1024,
dtype="auto",
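
The tests above (and several below) now accept example_prompts as a pytest fixture instead of hard-coding prompts in each test body. The fixture itself is not part of this diff; a minimal sketch of what it could look like in tests/conftest.py, with the prompt list and scope being assumptions:

import pytest


@pytest.fixture
def example_prompts():
    # Assumed prompt set; the actual fixture shared via tests/conftest.py
    # may provide different or additional prompts.
    return [
        "Hello, my name is",
        "The president of the United States is",
    ]
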
119 changes: 7 additions & 112 deletions tests/e2e/multicard/test_offline_inference_distributed.py
@@ -23,7 +23,7 @@
import os
from unittest.mock import patch

from modelscope import snapshot_download # type: ignore
import pytest
from vllm import SamplingParams
from vllm.model_executor.models.registry import ModelRegistry

@@ -32,98 +32,27 @@
os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"


def test_models_distributed_QwQ():
example_prompts = [
"Hello, my name is",
]
@pytest.mark.parametrize("distributed_executor_backend", ["mp", "ray"])
def test_models_distributed_QwQ(example_prompts, distributed_executor_backend):
dtype = "half"
max_tokens = 5
with VllmRunner(
"Qwen/QwQ-32B",
dtype=dtype,
tensor_parallel_size=4,
distributed_executor_backend="mp",
tensor_parallel_size=2,
distributed_executor_backend=distributed_executor_backend,
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)


def test_models_distributed_DeepSeek_multistream_moe():
example_prompts = [
"Hello, my name is",
]
dtype = "half"
max_tokens = 5
with VllmRunner(
"vllm-ascend/DeepSeek-V3-Pruning",
dtype=dtype,
tensor_parallel_size=4,
distributed_executor_backend="mp",
additional_config={
"torchair_graph_config": {
"enabled": True,
"enable_multistream_moe": True,
},
"ascend_scheduler_config": {
"enabled": True,
},
"refresh": True,
},
enforce_eager=False,
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)


@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE": "1"})
def test_models_distributed_topk() -> None:
example_prompts = [
"vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.",
"Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.",
"Compare and contrast artificial intelligence with human intelligence in terms of processing information.",
]
dtype = "half"
sampling_params = SamplingParams(max_tokens=5,
temperature=0.0,
top_k=50,
top_p=0.9)

with VllmRunner(
"deepseek-ai/DeepSeek-V2-Lite",
dtype=dtype,
tensor_parallel_size=4,
distributed_executor_backend="mp",
) as vllm_model:
vllm_model.generate(example_prompts, sampling_params)


@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_DBO": "1"})
def test_models_distributed_DeepSeek_dbo():
example_prompts = ["The president of the United States is"] * 41
dtype = "half"
sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
with VllmRunner(
"deepseek-ai/DeepSeek-V2-Lite",
dtype=dtype,
tensor_parallel_size=4,
distributed_executor_backend="mp",
) as vllm_model:
model_arch = 'DeepseekV2ForCausalLM'
registed_models = ModelRegistry.models
assert registed_models[
model_arch].module_name == "vllm_ascend.models.deepseek_dbo"
assert registed_models[
model_arch].class_name == "CustomDeepseekDBOForCausalLM"
vllm_model.generate(example_prompts, sampling_params)


@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_DBO": "1"})
def test_models_distributed_DeepSeekV3_dbo():
example_prompts = ["The president of the United States is"] * 41
def test_models_distributed_DeepSeekV3_dbo(example_prompts):
dtype = "half"
sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
with VllmRunner(
"vllm-ascend/DeepSeek-V3-Pruning",
dtype=dtype,
tensor_parallel_size=4,
tensor_parallel_size=2,
distributed_executor_backend="mp",
) as vllm_model:
model_arch = 'DeepseekV3ForCausalLM'
@@ -133,37 +62,3 @@ def test_models_distributed_DeepSeekV3_dbo():
assert registed_models[
model_arch].class_name == "CustomDeepseekDBOForCausalLM"
vllm_model.generate(example_prompts, sampling_params)


def test_models_distributed_DeepSeek_W8A8():
example_prompts = [
"Hello, my name is",
]
max_tokens = 5

with VllmRunner(
snapshot_download("vllm-ascend/DeepSeek-V2-Lite-W8A8"),
max_model_len=8192,
enforce_eager=True,
dtype="auto",
tensor_parallel_size=4,
quantization="ascend",
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)


def test_models_distributed_pangu():
example_prompts = [
"Hello, my name is",
]
max_tokens = 5

with VllmRunner(
snapshot_download("vllm-ascend/pangu-pro-moe-pruing"),
max_model_len=8192,
enforce_eager=True,
dtype="auto",
tensor_parallel_size=4,
distributed_executor_backend="mp",
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)
@@ -68,7 +68,6 @@ def test_prefix_cache_with_v1_scheduler(model: str, max_tokens: int) -> None:
with VllmRunner(model,
enforce_eager=True,
max_model_len=2048,
tensor_parallel_size=2,
gpu_memory_utilization=0.7) as vllm_model:
prefix_cache_output = vllm_model.generate_greedy(
INPUT_PROMPTS, max_tokens)
@@ -77,7 +76,6 @@ def test_prefix_cache_with_v1_scheduler(model: str, max_tokens: int) -> None:
enable_prefix_caching=False,
enforce_eager=True,
max_model_len=2048,
tensor_parallel_size=2,
gpu_memory_utilization=0.7) as vllm_model:
vllm_output = vllm_model.generate_greedy(INPUT_PROMPTS, max_tokens)

@@ -104,7 +102,6 @@ def test_prefix_cache_with_ascend_scheduler(model: str,
},
enforce_eager=True,
max_model_len=2048,
tensor_parallel_size=2,
gpu_memory_utilization=0.7) as vllm_model:
vllm_output = vllm_model.generate_greedy(INPUT_PROMPTS, max_tokens)

@@ -117,7 +114,6 @@ def test_prefix_cache_with_ascend_scheduler(model: str,
},
enforce_eager=True,
max_model_len=2048,
tensor_parallel_size=2,
gpu_memory_utilization=0.7) as vllm_model:
prefix_cache_output = vllm_model.generate_greedy(
INPUT_PROMPTS, max_tokens)
@@ -132,7 +128,6 @@ def test_prefix_cache_with_ascend_scheduler(model: str,
},
enforce_eager=True,
max_model_len=2048,
tensor_parallel_size=2,
gpu_memory_utilization=0.7) as vllm_model:
chunk_prefill_prefix_cache_output = vllm_model.generate_greedy(
INPUT_PROMPTS, max_tokens)
41 changes: 41 additions & 0 deletions tests/e2e/singlecard/quant/test_w8a8.py
@@ -0,0 +1,41 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#

import pytest

from tests.conftest import VllmRunner

MODELS = [
"vllm-ascend/DeepSeek-V2-Lite-W8A8",
"vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8"
]


@pytest.mark.parametrize("model", MODELS)
def test_quant_W8A8(example_prompts, model):
max_tokens = 5

with VllmRunner(
model,
max_model_len=8192,
enforce_eager=True,
dtype="auto",
gpu_memory_utilization=0.7,
quantization="ascend",
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)
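
Because this quantization check now lives under tests/e2e/singlecard, it can be exercised without a multi-card environment. A hedged usage example of invoking just this file programmatically (the path is taken from the diff header above; running the pytest CLI directly is equivalent):

import sys

import pytest

# Run only the relocated single-card W8A8 quantization tests.
sys.exit(pytest.main(["-q", "tests/e2e/singlecard/quant/test_w8a8.py"]))
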
49 changes: 49 additions & 0 deletions tests/e2e/singlecard/sample/test_e2e_with_topk.py
@@ -0,0 +1,49 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#

import os
from unittest.mock import patch

import pytest
from vllm import SamplingParams

from tests.conftest import VllmRunner

MODELS = ["deepseek-ai/DeepSeek-V2-Lite", "Qwen/Qwen2.5-0.5B-Instruct"]


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half", "float16"])
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE": "1"})
def test_models_distributed_topk(model, dtype) -> None:
example_prompts = [
"vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.",
"Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.",
"Compare and contrast artificial intelligence with human intelligence in terms of processing information.",
]
sampling_params = SamplingParams(max_tokens=5,
temperature=0.0,
top_k=50,
top_p=0.9)

with VllmRunner(
model,
dtype=dtype,
gpu_memory_utilization=0.7,
) as vllm_model:
vllm_model.generate(example_prompts, sampling_params)
@@ -34,13 +34,13 @@
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [1])
def test_models(
example_prompts,
model: str,
max_tokens: int,
monkeypatch: pytest.MonkeyPatch,
) -> None:
return
with monkeypatch.context() as m:
prompts = "The president of the United States is"

m.setenv("VLLM_USE_V1", "1")

@@ -52,7 +52,7 @@ def test_models(
vllm_model = LLM(model,
long_prefill_token_threshold=4,
enforce_eager=True)
output_chunked = vllm_model.generate(prompts, sampling_params)
output_chunked = vllm_model.generate(example_prompts, sampling_params)
logprobs_chunked = output_chunked.outputs[0].logprobs
del vllm_model
torch.npu.empty_cache()
@@ -64,7 +64,7 @@ def test_models(
'enabled': True
},
})
output = vllm_model.generate(prompts, sampling_params)
output = vllm_model.generate(example_prompts, sampling_params)
logprobs = output.outputs[0].logprobs
del vllm_model
torch.npu.empty_cache()