Run `pytest tests/multicard/test_torchair_graph_mode.py`.
"""
import os

+ from typing import Dict

import pytest

from tests.conftest import VllmRunner

# Cap the caching allocator's split size (mirrors PYTORCH_CUDA_ALLOC_CONF)
# to limit device-memory fragmentation across the multiprocess workers.
os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"


+ def _deepseek_torchair_test_fixture(
+         additional_config: Dict,
+         *,
+         tensor_parallel_size=4,
+ ):
+     example_prompts = [
+         "Hello, my name is",
+         "The president of the United States is",
+         "The capital of France is",
+         "The future of AI is",
+     ]
+
+     # torchair only works without chunked prefill for now, so force the
+     # ascend scheduler on top of whatever the caller passed in.
+     kwargs = {
+         "ascend_scheduler_config": {
+             "enabled": True,
+         },
+         "refresh": True,
+     }
+     additional_config.update(**kwargs)
+
+     with VllmRunner(
+             "vllm-ascend/DeepSeek-V3-Pruning",
+             dtype="half",
+             tensor_parallel_size=tensor_parallel_size,
+             distributed_executor_backend="mp",
+             enforce_eager=False,
+             additional_config=additional_config,
+     ) as vllm_model:
+         # use greedy sampling so the generated results are deterministic
+         vllm_output = vllm_model.generate_greedy(example_prompts, 5)
+
+     # NOTE: vllm-ascend/DeepSeek-V3-Pruning is a random-weight copy of
+     # DeepSeek-V3 with 2 hidden layers, so the golden results look
+     # inaccurate. They will only change if accuracy improves with the
+     # official weights of DeepSeek-V3.
+     golden_results = [
+         'Hello, my name is feasibility伸 spazio debtor添',
+         'The president of the United States is begg"""\n 杭州风和 bestimm',
+         'The capital of France is frequentlyশามalinkAllowed',
+         'The future of AI is deleting俯احت怎么样了حراف',
+     ]
+
+     assert len(golden_results) == len(vllm_output)
+     for i in range(len(vllm_output)):
+         assert golden_results[i] == vllm_output[i][1]
+         print(f"Generated text: {vllm_output[i][1]!r}")
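
Aside (not part of the diff): `dict.update(**kwargs)` is a shallow, top-level merge, so the fixture would silently overwrite a caller-supplied `ascend_scheduler_config`; only distinct top-level keys such as `torchair_graph_config` survive. A minimal sketch of the merge as the tests below exercise it:

    additional_config = {
        "torchair_graph_config": {
            "enabled": True,
        },
    }
    additional_config.update(**{
        "ascend_scheduler_config": {"enabled": True},
        "refresh": True,
    })
    assert additional_config == {
        "torchair_graph_config": {"enabled": True},
        "ascend_scheduler_config": {"enabled": True},
        "refresh": True,
    }
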
@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0",
                    reason="torchair graph is not supported on v0")
@pytest.mark.parametrize("VLLM_ASCEND_ENABLE_DBO", ["0", "1"])
@@ -38,46 +88,25 @@ def test_e2e_deepseekv3_with_torchair(monkeypatch: pytest.MonkeyPatch,
        m.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
        m.setenv("VLLM_ASCEND_ENABLE_DBO", VLLM_ASCEND_ENABLE_DBO)

-         example_prompts = [
-             "Hello, my name is",
-             "The president of the United States is",
-             "The capital of France is",
-             "The future of AI is",
-         ]
-         dtype = "half"
-         max_tokens = 5
-         # torchair only works without chunked prefill for now
-         with VllmRunner(
-                 "vllm-ascend/DeepSeek-V3-Pruning",
-                 dtype=dtype,
-                 tensor_parallel_size=4,
-                 distributed_executor_backend="mp",
-                 additional_config={
-                     "torchair_graph_config": {
-                         "enabled": True,
-                     },
-                     "ascend_scheduler_config": {
-                         "enabled": True,
-                     },
-                     "refresh": True,
-                 },
-                 enforce_eager=False,
-         ) as vllm_model:
-             # use greedy sampling so the generated results are deterministic
-             vllm_output = vllm_model.generate_greedy(example_prompts,
-                                                      max_tokens)
-         # NOTE: vllm-ascend/DeepSeek-V3-Pruning is a random-weight copy of
-         # DeepSeek-V3 with 2 hidden layers, so the golden results look
-         # inaccurate. They will only change if accuracy improves with the
-         # official weights of DeepSeek-V3.
-         golden_results = [
-             'Hello, my name is feasibility伸 spazio debtor添',
-             'The president of the United States is begg"""\n 杭州风和 bestimm',
-             'The capital of France is frequentlyশามalinkAllowed',
-             'The future of AI is deleting俯احت怎么样了حراف',
-         ]
-
-         assert len(golden_results) == len(vllm_output)
-         for i in range(len(vllm_output)):
-             assert golden_results[i] == vllm_output[i][1]
-             print(f"Generated text: {vllm_output[i][1]!r}")
+         additional_config = {
+             "torchair_graph_config": {
+                 "enabled": True,
+             },
+         }
+         _deepseek_torchair_test_fixture(additional_config)
+
+
+ @pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0",
+                     reason="torchair graph is not supported on v0")
+ def test_e2e_deepseekv3_with_torchair_ms_mla(monkeypatch: pytest.MonkeyPatch):
+     with monkeypatch.context() as m:
+         m.setenv("VLLM_USE_MODELSCOPE", "True")
+         m.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
+
+         additional_config = {
+             "torchair_graph_config": {
+                 "enabled": True,
+                 "enable_multistream_mla": True,
+             },
+         }
+         _deepseek_torchair_test_fixture(additional_config)
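
Aside (not part of the diff): `generate_greedy` returns a `(token_ids, text)` pair per prompt, which is why the assertions in the fixture index `[1]`. A hedged sketch of driving the shared fixture directly, outside pytest; it assumes the same 4-NPU environment the tests above rely on:

    if __name__ == "__main__":
        # Hypothetical direct invocation, mirroring the ms_mla test above.
        _deepseek_torchair_test_fixture(
            {
                "torchair_graph_config": {
                    "enabled": True,
                    "enable_multistream_mla": True,
                },
            },
            tensor_parallel_size=4,
        )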