@@ -20,6 +20,7 @@
 Run `pytest tests/multicard/test_torchair_graph_mode.py`.
 """
 import os
+from typing import Dict
 
 import pytest
 
@@ -28,53 +29,73 @@
 os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"
 
 
+def _deepseek_torchair_test_fixture(
+        additional_config: Dict,
+        *,
+        tensor_parallel_size=4,
+):
+    example_prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+
+    # torchair only works without chunked prefill for now
+    kwargs = {
+        "ascend_scheduler_config": {
+            "enabled": True,
+        },
+        "refresh": True,
+    }
+    additional_config.update(**kwargs)
+
+    with VllmRunner(
+            "vllm-ascend/DeepSeek-V3-Pruning",
+            dtype="half",
+            tensor_parallel_size=tensor_parallel_size,
+            distributed_executor_backend="mp",
+            enforce_eager=False,
+            additional_config=additional_config,
+    ) as vllm_model:
+        # use greedy sampling to make sure the generated results are fixed
+        vllm_output = vllm_model.generate_greedy(example_prompts, 5)
+
+    # NOTE: vllm-ascend/DeepSeek-V3-Pruning is a random-weight checkpoint of
+    # DeepSeek-V3 with 2 hidden layers, thus the golden results seem
+    # inaccurate. This will only change if accuracy improves with the
+    # official weights of DeepSeek-V3.
+    golden_results = [
+        'Hello, my name is feasibility伸 spazio debtor添',
+        'The president of the United States is begg"""\n杭州风和 bestimm',
+        'The capital of France is frequentlyশามalinkAllowed',
+        'The future of AI is deleting俯احت怎么样了حراف',
+    ]
+
+    assert len(golden_results) == len(vllm_output)
+    for i in range(len(vllm_output)):
+        assert golden_results[i] == vllm_output[i][1]
+        print(f"Generated text: {vllm_output[i][1]!r}")
+
+
 @pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0",
                     reason="torchair graph is not supported on v0")
-def test_e2e_deepseekv3_with_torchair(monkeypatch: pytest.MonkeyPatch):
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_MODELSCOPE", "True")
-        m.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
+def test_e2e_deepseekv3_with_torchair():
+    additional_config = {
+        "torchair_graph_config": {
+            "enabled": True,
+        },
+    }
+    _deepseek_torchair_test_fixture(additional_config)
 
-        example_prompts = [
-            "Hello, my name is",
-            "The president of the United States is",
-            "The capital of France is",
-            "The future of AI is",
-        ]
-        dtype = "half"
-        max_tokens = 5
-        # torchair is only work without chunked-prefill now
-        with VllmRunner(
-                "vllm-ascend/DeepSeek-V3-Pruning",
-                dtype=dtype,
-                tensor_parallel_size=4,
-                distributed_executor_backend="mp",
-                additional_config={
-                    "torchair_graph_config": {
-                        "enabled": True,
-                    },
-                    "ascend_scheduler_config": {
-                        "enabled": True,
-                    },
-                    "refresh": True,
-                },
-                enforce_eager=False,
-        ) as vllm_model:
-            # use greedy sampler to make sure the generated results are fix
-            vllm_output = vllm_model.generate_greedy(example_prompts,
-                                                     max_tokens)
-        # NOTE: vllm-ascend/DeepSeek-V3-Pruning is a random weight of
-        # DeepSeek-V3 with 2 hidden layers, thus the golden results seems
-        # inaccurate. This will only change if accuracy improves with the
-        # official weights of DeepSeek-V3.
-        golden_results = [
-            'Hello, my name is feasibility伸 spazio debtor添',
-            'The president of the United States is begg"""\n杭州风和 bestimm',
-            'The capital of France is frequentlyশามalinkAllowed',
-            'The future of AI is deleting俯احت怎么样了حراف',
-        ]
 
-        assert len(golden_results) == len(vllm_output)
-        for i in range(len(vllm_output)):
-            assert golden_results[i] == vllm_output[i][1]
-            print(f"Generated text: {vllm_output[i][1]!r}")
+@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0",
+                    reason="torchair graph is not supported on v0")
+def test_e2e_deepseekv3_with_torchair_ms_mla():
+    additional_config = {
+        "torchair_graph_config": {
+            "enabled": True,
+            "enable_multistream_mla": True,
+        },
+    }
+    _deepseek_torchair_test_fixture(additional_config)
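
With this refactor, each end-to-end case reduces to building an `additional_config` dict and delegating to `_deepseek_torchair_test_fixture`. As a minimal sketch of how a further variant could reuse the helper in the same module, the test below exercises the keyword-only `tensor_parallel_size` parameter; the TP=2 case is hypothetical and not part of this diff:

```python
# Hypothetical follow-up case (not in this diff): same torchair graph
# config as test_e2e_deepseekv3_with_torchair, but run on 2 cards via
# the fixture's keyword-only tensor_parallel_size parameter. Caveat:
# the fixture's golden strings were presumably recorded at the default
# TP=4, so a different world size would likely need its own goldens.
@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0",
                    reason="torchair graph is not supported on v0")
def test_e2e_deepseekv3_with_torchair_tp2():
    additional_config = {
        "torchair_graph_config": {
            "enabled": True,
        },
    }
    _deepseek_torchair_test_fixture(additional_config,
                                    tensor_parallel_size=2)
```

Making `tensor_parallel_size` keyword-only keeps call sites explicit, so a reader can tell at a glance how many NPUs a given case expects.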