
Commit 305a0eb

Author: weijinqian_v1 <weijinqian@huawei.com>
Merge:  6f6efc1 5559443

    handle conflict

    Signed-off-by: weijinqian_v1 <weijinqian@huawei.com>

29 files changed, 1278 insertions(+), 550 deletions(-)

.github/workflows/vllm_ascend_test.yaml

Lines changed: 1 addition & 0 deletions
@@ -203,6 +203,7 @@ jobs:
           VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_topk
           VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W8A8
           VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_dbo
+          VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_w8a8_ep_dbo
           VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeekV3_dbo
           VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/ --ignore=tests/multicard/test_ilama_lora_tp2.py --ignore=tests/multicard/test_offline_inference_distributed.py --ignore=tests/multicard/test_w4a8_deepseek.py
         fi
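
To reproduce this new CI step locally, a minimal Python sketch (the test selector and the ModelScope toggle mirror the workflow line above; a multi-card Ascend NPU environment and the vllm-ascend test tree are assumed):

    import os

    import pytest

    # Mirror the CI invocation: fetch weights via ModelScope and run only the
    # new W8A8 expert-parallel DBO test added in this commit.
    os.environ["VLLM_USE_MODELSCOPE"] = "True"
    raise SystemExit(pytest.main([
        "-sv",
        "tests/multicard/test_offline_inference_distributed.py"
        "::test_models_distributed_DeepSeek_w8a8_ep_dbo",
    ]))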

.github/workflows/vllm_ascend_test_long_term.yaml

Lines changed: 3 additions & 3 deletions
@@ -95,11 +95,11 @@ jobs:
       run: |
         if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then
           # v0 spec decode test
-          VLLM_USE_MODELSCOPE=True pytest -sv tests/long_term/spec_decode_v0/e2e/test_mtp_correctness.py # it needs a clean process
-          pytest -sv tests/long_term/spec_decode_v0 --ignore=tests/long_term/spec_decode_v0/e2e/test_mtp_correctness.py
+          # VLLM_USE_MODELSCOPE=True pytest -sv tests/long_term/spec_decode_v0/e2e/test_mtp_correctness.py # it needs a clean process
+          # pytest -sv tests/long_term/spec_decode_v0 --ignore=tests/long_term/spec_decode_v0/e2e/test_mtp_correctness.py
           # v1 spec decode test
           # TODO: revert me when test_v1_mtp_correctness.py is fixed
-          # VLLM_USE_MODELSCOPE=True pytest -sv tests/long_term/spec_decode_v1/test_v1_mtp_correctness.py
+          VLLM_USE_MODELSCOPE=True pytest -sv tests/long_term/spec_decode_v1/test_v1_mtp_correctness.py
           # TODO: revert me when test_v1_spec_decode.py::test_ngram_correctness is fixed
           # VLLM_USE_MODELSCOPE=True pytest -sv tests/long_term/spec_decode_v1/test_v1_spec_decode.py
           # accuracy test single card

tests/long_term/spec_decode_v1/test_v1_mtp_correctness.py

Lines changed: 64 additions & 1 deletion
@@ -63,7 +63,10 @@ def test_mtp_correctness(
     with monkeypatch.context() as m:
         m.setenv("VLLM_USE_V1", "1")

-        ref_llm = LLM(model=model_name, max_model_len=256, enforce_eager=True)
+        ref_llm = LLM(model=model_name,
+                      max_model_len=256,
+                      gpu_memory_utilization=0.8,
+                      enforce_eager=True)
         ref_outputs = ref_llm.chat(test_prompts, sampling_config)
         del ref_llm

@@ -74,6 +77,7 @@ def test_mtp_correctness(
                 "num_speculative_tokens": 1,
             },
             max_model_len=256,
+            gpu_memory_utilization=0.8,
             enforce_eager=True)
         spec_outputs = spec_llm.chat(test_prompts, sampling_config)
         matches = 0
@@ -90,3 +94,62 @@ def test_mtp_correctness(
         # Upon failure, inspect the outputs to check for inaccuracy.
         assert matches > int(0.66 * len(ref_outputs))
         del spec_llm
+
+
+def test_mtp_torchair_correctness(
+        monkeypatch: pytest.MonkeyPatch,
+        test_prompts: list[list[dict[str, Any]]],
+        sampling_config: SamplingParams,
+        model_name: str,
+):
+    '''
+    Compare the outputs of an original LLM and a speculative LLM;
+    they should be the same when using MTP speculative decoding.
+    '''
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", "1")
+
+        ref_llm = LLM(model=model_name,
+                      max_model_len=256,
+                      enforce_eager=False,
+                      additional_config={
+                          "torchair_graph_config": {
+                              "enabled": True
+                          },
+                          "ascend_scheduler_config": {
+                              "enabled": True
+                          },
+                      })
+        ref_outputs = ref_llm.chat(test_prompts, sampling_config)
+        del ref_llm
+
+        spec_llm = LLM(model=model_name,
+                       trust_remote_code=True,
+                       enforce_eager=False,
+                       speculative_config={
+                           "method": "deepseek_mtp",
+                           "num_speculative_tokens": 1,
+                       },
+                       additional_config={
+                           "torchair_graph_config": {
+                               "enabled": True
+                           },
+                           "ascend_scheduler_config": {
+                               "enabled": True
+                           },
+                       })
+        spec_outputs = spec_llm.chat(test_prompts, sampling_config)
+        matches = 0
+        misses = 0
+        for ref_output, spec_output in zip(ref_outputs, spec_outputs):
+            if ref_output.outputs[0].text == spec_output.outputs[0].text:
+                matches += 1
+            else:
+                misses += 1
+                print(f"ref_output: {ref_output.outputs[0].text}")
+                print(f"spec_output: {spec_output.outputs[0].text}")
+
+        # Heuristic: expect at least 66% of the prompts to match exactly
+        # Upon failure, inspect the outputs to check for inaccuracy.
+        assert matches > int(0.66 * len(ref_outputs))
+        del spec_llm
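
Both tests in this file share the same pass criterion: greedy outputs from the reference engine and the MTP speculative engine are compared for exact string equality, and more than 66% of prompts must match. A minimal sketch of that heuristic factored out as a helper (hypothetical, not part of the diff):

    from vllm import RequestOutput


    def exact_match_rate(ref_outputs: list[RequestOutput],
                         spec_outputs: list[RequestOutput]) -> float:
        # Count prompts whose first completion is byte-for-byte identical
        # between the reference run and the speculative (MTP) run.
        matches = sum(
            1 for ref, spec in zip(ref_outputs, spec_outputs)
            if ref.outputs[0].text == spec.outputs[0].text)
        return matches / max(len(ref_outputs), 1)


    # Equivalent to the in-test assertion:
    # assert exact_match_rate(ref_outputs, spec_outputs) > 0.66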

tests/multicard/test_offline_inference_distributed.py

Lines changed: 24 additions & 0 deletions
@@ -109,6 +109,30 @@ def test_models_distributed_DeepSeek_dbo():
         vllm_model.generate(example_prompts, sampling_params)


+@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_DBO": "1"})
+def test_models_distributed_DeepSeek_w8a8_ep_dbo():
+    example_prompts = ["The president of the United States is"] * 100
+    sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
+    with VllmRunner(
+            snapshot_download("vllm-ascend/DeepSeek-V2-Lite-W8A8"),
+            dtype="auto",
+            quantization="ascend",
+            tensor_parallel_size=4,
+            enforce_eager=True,
+            enable_expert_parallel=True,
+            distributed_executor_backend="mp",
+            additional_config={"ascend_scheduler_config": {
+                "enabled": True,
+            }}) as vllm_model:
+        model_arch = 'DeepseekV2ForCausalLM'
+        registed_models = ModelRegistry.models
+        assert registed_models[
+            model_arch].module_name == "vllm_ascend.models.deepseek_dbo"
+        assert registed_models[
+            model_arch].class_name == "CustomDeepseekDBOForCausalLM"
+        vllm_model.generate(example_prompts, sampling_params)
+
+
 @pytest.mark.skip(reason="Due to OOM,waiting for 1311pr to merge in")
 @patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_DBO": "1"})
 def test_models_distributed_DeepSeekV3_dbo():
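
The new test asserts that, with VLLM_ASCEND_ENABLE_DBO=1, the DeepseekV2ForCausalLM architecture resolves to the DBO implementation supplied by vllm-ascend. A rough standalone sketch of that registry lookup follows; note the test performs this check inside a live VllmRunner, i.e. after the vllm-ascend plugin has registered its models, so outside an engine the override may not be visible yet. The expected module and class strings are taken from the test above.

    import os

    # Must be set before vllm / vllm_ascend load so the plugin registers the
    # DBO variant of the DeepSeek model (timing depends on plugin loading).
    os.environ["VLLM_ASCEND_ENABLE_DBO"] = "1"

    from vllm import ModelRegistry  # noqa: E402

    entry = ModelRegistry.models["DeepseekV2ForCausalLM"]
    print(entry.module_name)  # expected: vllm_ascend.models.deepseek_dbo
    print(entry.class_name)   # expected: CustomDeepseekDBOForCausalLM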

tests/multicard/test_torchair_graph_mode.py

Lines changed: 4 additions & 4 deletions
@@ -71,10 +71,10 @@ def test_e2e_deepseekv3_with_torchair(monkeypatch: pytest.MonkeyPatch,
     # inaccurate. This will only change if accuracy improves with the
     # official weights of DeepSeek-V3.
     golden_results = [
-        'Hello, my name is feasibility伸 spazio debtor添',
-        'The president of the United States is begg"""\n杭州风和 bestimm',
-        'The capital of France is frequentlyশามalinkAllowed',
-        'The future of AI is deleting俯احت怎么样了حراف',
+        'Hello, my name is下载早点向前很有่อง',
+        'The president of the United States isSender)## physiological Albany',
+        'The capital of France is Rocky转角 hospitalizedinterval sparked',
+        'The future of AI is её asegο BIOS一扫',
     ]

     assert len(golden_results) == len(vllm_output)

tests/singlecard/test_aclgraph.py

Lines changed: 11 additions & 1 deletion
@@ -36,9 +36,11 @@
                     reason="aclgraph only support on v1")
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("max_tokens", [32])
+@pytest.mark.parametrize("full_graph", [False])
 def test_models(
     model: str,
     max_tokens: int,
+    full_graph: bool,
     monkeypatch: pytest.MonkeyPatch,
 ) -> None:
     with monkeypatch.context() as m:
@@ -54,7 +56,15 @@ def test_models(
                                          temperature=0.0)
         # TODO: change to use vllmrunner when the registry of custom op is solved
         # while running pytest
-        vllm_model = LLM(model)
+        if full_graph:
+            vllm_model = LLM(model,
+                             compilation_config={
+                                 "full_cuda_graph": True,
+                                 "cudagraph_capture_sizes":
+                                 [1, 4, 16, 64, 256]
+                             })
+        else:
+            vllm_model = LLM(model)
         vllm_aclgraph_outputs = vllm_model.generate(prompts, sampling_params)
         del vllm_model
         torch.npu.empty_cache()
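
The new full_graph branch exercises vLLM's compilation_config path: full_cuda_graph enables full-graph capture and cudagraph_capture_sizes pins the batch sizes to capture. Note the parameter is currently parametrized with only [False], so this branch is not exercised by default. A minimal standalone sketch of that configuration (the model name is a placeholder, not taken from the diff; the sizes mirror the test):

    from vllm import LLM, SamplingParams

    # Placeholder model; the test iterates over its own MODELS list.
    llm = LLM("Qwen/Qwen2.5-0.5B-Instruct",
              compilation_config={
                  "full_cuda_graph": True,
                  "cudagraph_capture_sizes": [1, 4, 16, 64, 256],
              })

    outputs = llm.generate(["Hello, my name is"],
                           SamplingParams(temperature=0.0, max_tokens=32))
    print(outputs[0].outputs[0].text)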

vllm_ascend/ascend_forward_context.py

Lines changed: 10 additions & 1 deletion
@@ -60,12 +60,21 @@ def set_ascend_forward_context(

     forward_context.in_profile_run = in_profile_run

+    # NOTE: This cannot be set using set_forward_context
+    # due to multiple warmups before actual capturing
+    forward_context.capturing = False
+
     dp_world_size = get_dp_group().world_size
     if dp_world_size > 1 and forward_context.dp_metadata is not None:
         forward_context.max_tokens_across_dp = forward_context.dp_metadata.max_tokens_across_dp_cpu.item(
         )
+    elif num_tokens is not None:
+        forward_context.max_tokens_across_dp = num_tokens
     elif attn_metadata is not None:
-        forward_context.max_tokens_across_dp = num_tokens or attn_metadata.num_actual_tokens
+        if hasattr(attn_metadata, 'num_actual_tokens'):
+            forward_context.max_tokens_across_dp = attn_metadata.num_actual_tokens
+        else:
+            forward_context.max_tokens_across_dp = attn_metadata.num_prefill_tokens + attn_metadata.num_decode_tokens
     else:
         forward_context.max_tokens_across_dp = None

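This hunk changes how max_tokens_across_dp is derived: an explicitly passed num_tokens now takes priority over attention metadata, and metadata that lacks num_actual_tokens falls back to the sum of prefill and decode tokens. A condensed sketch of the resulting resolution order (a hypothetical helper, not the actual function in vllm_ascend):

    from typing import Any, Optional


    def resolve_max_tokens_across_dp(dp_world_size: int,
                                     dp_metadata: Optional[Any],
                                     num_tokens: Optional[int],
                                     attn_metadata: Optional[Any]) -> Optional[int]:
        # 1. Data parallel with metadata: take the max across DP ranks.
        if dp_world_size > 1 and dp_metadata is not None:
            return dp_metadata.max_tokens_across_dp_cpu.item()
        # 2. An explicitly provided token count wins next.
        if num_tokens is not None:
            return num_tokens
        # 3. Fall back to attention metadata, which may or may not expose
        #    num_actual_tokens depending on the backend.
        if attn_metadata is not None:
            if hasattr(attn_metadata, "num_actual_tokens"):
                return attn_metadata.num_actual_tokens
            return attn_metadata.num_prefill_tokens + attn_metadata.num_decode_tokens
        # 4. Nothing to derive from.
        return None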
