from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                               ReplicatedLinear,
-                                               RowParallelLinear)
+                                               RowParallelLinear,
+                                               UnquantizedLinearMethod)
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.rotary_embedding import get_rope
                                              MultiStreamStepMetadata,
                                              make_multistream_metadata_ds)
from vllm_ascend.ops.fused_moe import AscendFusedMoE
+from vllm_ascend.quantization.w8a8_dynamic import AscendW8A8DynamicLinearMethod
from vllm_ascend.utils import dispose_tensor

VLLM_ASCEND_ENABLE_DBO: bool = envs_ascend.VLLM_ASCEND_ENABLE_DBO


class CustomDeepseekDBOMLP(CustomDeepseekV2MLP):

+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        reduce_results: bool = True,
+        prefix: str = "",
+    ) -> None:
+        super().__init__(hidden_size=hidden_size,
+                         intermediate_size=intermediate_size,
+                         hidden_act=hidden_act,
+                         quant_config=quant_config,
+                         prefix=prefix)
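+        # True only when the layer is quantized and the wrapped Ascend scheme
+        # is W8A8 dynamic quantization (see the isinstance checks below).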
+        self.is_dynamic_quant = not isinstance(
+            self.gate_up_proj.quant_method,
+            UnquantizedLinearMethod) and isinstance(
+                self.gate_up_proj.quant_method.quant_method,
+                AscendW8A8DynamicLinearMethod)
+
    def _forward_ms_mlp(self, x):
        current_ms_metadata = get_multistream_comm_context()
        assert current_ms_metadata is not None
        gate_up, _ = self.gate_up_proj(x)
-        x, dynamic_scale = self.act_fn(gate_up)
-        x = torch_npu.npu_quant_matmul(
-            x,
-            self.down_proj.weight,
-            self.down_proj.weight_scale,
-            pertoken_scale=dynamic_scale,
-            output_dtype=torch.bfloat16,
-        )
-        if self.down_proj.reduce_results and self.down_proj.tp_size > 1:
-            current_ms_metadata.before_comm_event.record()
-            with torch.npu.stream(current_ms_metadata.comm_stream):
-                current_ms_metadata.before_comm_event.wait()
-                x = tensor_model_parallel_all_reduce(x)
-                current_ms_metadata.after_comm_event.record()
+        if self.is_dynamic_quant:
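+            # Dynamic-quant path: act_fn also returns per-token scales, which
+            # feed the fused npu_quant_matmul down projection; the TP
+            # all-reduce below is overlapped on the multistream comm stream.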
+            x, dynamic_scale = self.act_fn(gate_up)
+            x = torch_npu.npu_quant_matmul(
+                x,
+                self.down_proj.weight,
+                self.down_proj.weight_scale,
+                pertoken_scale=dynamic_scale,
+                output_dtype=torch.bfloat16,
+            )
+            if self.down_proj.reduce_results and self.down_proj.tp_size > 1:
+                current_ms_metadata.before_comm_event.record()
+                with torch.npu.stream(current_ms_metadata.comm_stream):
+                    current_ms_metadata.before_comm_event.wait()
+                    x = tensor_model_parallel_all_reduce(x)
+                    current_ms_metadata.after_comm_event.record()
+        else:
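+            # Fallback for unquantized layers: plain activation and down_proj.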
+            x = self.act_fn(gate_up)
+            x, _ = self.down_proj(x)
        return x

@@ -796,6 +822,7 @@ def forward(
        attn_metadata: Optional[AttentionMetadata] = None,
        intermediate_tensors: Optional[IntermediateTensors] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
+        graph_enable: Optional[bool] = True
    ) -> Union[torch.Tensor, IntermediateTensors]:
        if get_pp_group().is_first_rank:
            if inputs_embeds is not None:
@@ -809,8 +836,9 @@ def forward(
            residual = intermediate_tensors["residual"]

        num_normal_layers = (self.first_k_dense_replace
-                             if VLLM_ASCEND_ENABLE_DBO and self.can_run_ms()
-                             else self.end_layer - self.start_layer)
+                             if VLLM_ASCEND_ENABLE_DBO and not graph_enable
+                             and self.can_run_ms() else self.end_layer -
+                             self.start_layer)

        moe_start_layer = self.start_layer + num_normal_layers
        for i in range(self.start_layer, min(moe_start_layer, self.end_layer)):
@@ -847,15 +875,13 @@ def can_run_ms(self):
            return False
        return True

-    def _forward_ms_layers(
-        self,
-        positions: torch.Tensor,
-        hidden_states: torch.Tensor,
-        residual: torch.Tensor,
-        moe_start_layer: int,
-        kv_caches: Optional[List[torch.Tensor]] = None,
-        is_prefill: bool = False,
-    ):
+    def _forward_ms_layers(self,
+                           positions: torch.Tensor,
+                           hidden_states: torch.Tensor,
+                           residual: torch.Tensor,
+                           moe_start_layer: int,
+                           kv_caches: Optional[List[torch.Tensor]] = None,
+                           is_prefill: bool = False):

        if moe_start_layer == self.end_layer:
            return hidden_states, residual
@@ -917,8 +943,9 @@ def forward(
        attn_metadata: Optional[AttentionMetadata] = None,
        intermediate_tensors: Optional[IntermediateTensors] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
+        graph_enable: Optional[bool] = True
    ) -> Union[torch.Tensor, IntermediateTensors]:
        hidden_states = self.model(input_ids, positions, kv_caches,
                                   attn_metadata, intermediate_tensors,
-                                   inputs_embeds)
+                                   inputs_embeds, graph_enable)
        return hidden_states