rm router logits Improve TTOP 3ms #1407

Merged
merged 19 commits on Jul 11, 2025
Changes from 4 commits
Commits
19 commits
72aeb69
rm router logits Improve TTOP 3ms
ttanzhiqiang Jun 24, 2025
04ad4c2
update
ttanzhiqiang Jun 24, 2025
f13442e
update
ttanzhiqiang Jun 25, 2025
db520cd
Merge branch 'main' into rm_router_logits
ttanzhiqiang Jun 25, 2025
4c8954a
Merge branch 'main' into rm_router_logits
ttanzhiqiang Jul 8, 2025
86df0a2
deepseekv3/r1 support rm_router_logits in [AllGatherEP, AllGather, Na…
ttanzhiqiang Jul 8, 2025
2f77bc9
deepseekv3/r1 support rm_router_logits in [AllGatherEP, AllGather, Na…
ttanzhiqiang Jul 8, 2025
6f18307
deepseekv3/r1 support rm_router_logits in [AllGatherEP, AllGather, Na…
ttanzhiqiang Jul 8, 2025
cb15e05
deepseekv3/r1 support rm_router_logits in [AllGatherEP, AllGather, Na…
ttanzhiqiang Jul 8, 2025
d8755c9
deepseekv3/r1 support rm_router_logits in [AllGatherEP, AllGather, Na…
ttanzhiqiang Jul 8, 2025
e0c36a8
deepseekv3/r1 support rm_router_logits in [AllGatherEP, AllGather, Na…
ttanzhiqiang Jul 8, 2025
9e15f42
deepseekv3/r1 support rm_router_logits in [AllGatherEP, AllGather, Na…
ttanzhiqiang Jul 8, 2025
a595a67
deepseekv3/r1 support rm_router_logits in [AllGatherEP, AllGather, Na…
ttanzhiqiang Jul 8, 2025
eedcd05
Merge branch 'main' into rm_router_logits
ttanzhiqiang Jul 9, 2025
fa50f6a
deepseekv3/r1 support rm_router_logits in [AllGatherEP, AllGather, Na…
ttanzhiqiang Jul 9, 2025
e4fc29f
Empty submission
ttanzhiqiang Jul 9, 2025
a0be155
Empty submission
ttanzhiqiang Jul 10, 2025
89458f0
Merge branch 'main' into rm_router_logits
ttanzhiqiang Jul 10, 2025
af900cc
update
ttanzhiqiang Jul 10, 2025
1 change: 1 addition & 0 deletions examples/run_dp_attention_etp16.sh
@@ -4,6 +4,7 @@ source /usr/local/Ascend/ascend-toolkit/set_env.sh
source /usr/local/Ascend/nnal/atb/set_env.sh
export ASCEND_LAUNCH_BLOCKING=0
export VLLM_VERSION=0.9.0
export VLLM_ASCEND_RM_ROUTER_LOGITS=1

nohup python -m vllm.entrypoints.openai.api_server --model=/mnt/deepseek/DeepSeek-R1-W8A8-VLLM \
--quantization ascend \
3 changes: 3 additions & 0 deletions vllm_ascend/envs.py
@@ -121,6 +121,9 @@
# value to False to disable the optimized model.
"USE_OPTIMIZED_MODEL":
lambda: bool(int(os.getenv('USE_OPTIMIZED_MODEL', '1'))),
# Replace the two get_dp_group().all_gather communications with a single one, and run the gate after the communication
"VLLM_ASCEND_RM_ROUTER_LOGITS":
lambda: int(os.getenv("VLLM_ASCEND_RM_ROUTER_LOGITS", 0)),
Collaborator

Starting from Q3, we want to be careful about adding more configuration options. Please remove the env var so that rm_router_logits is enabled by default.

Contributor Author

This is only valid in the FusedMoEState.AllGather solution. If other models (such as deepseek_dbo/qwen3/qwen2) keep calling the gate externally while rm_router_logits is applied internally, an error will be reported.
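
To make the failure mode concrete, here is a minimal sketch under assumed names (the function and its parameters are simplified stand-ins, not the actual AscendFusedMoE.forward signature): if the env var is set globally but a model has not been migrated to pass its gate module into the fused-MoE layer, the internal routing call receives gate=None and fails.

```python
# Minimal sketch of the failure mode; names are illustrative, not the real API.
def fused_moe_forward(hidden_states, router_logits=None, gate=None,
                      rm_router_logits=False):
    if rm_router_logits:
        # Models such as deepseek_dbo/qwen3/qwen2 still compute router_logits
        # with their own gate and do not pass `gate=` here, so this call
        # raises "TypeError: 'NoneType' object is not callable".
        router_logits, _ = gate(hidden_states)
    return router_logits
```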

Collaborator
@Yikun Yikun Jul 6, 2025

Agree. Could we enable this automatically in the applicable cases? It's difficult for users to know which models should enable this env var and which should not.

Otherwise, LGTM.

Contributor Author

In theory this solution only applies to AllGather and AllGatherEP: in the DP scenario the previous flow was gate + two communications, and it is now changed to one communication + gate, which saves some communication time. In principle every MoE AllGather/AllGatherEP path could follow this logic, but the DP paths of other MoE models (e.g. qwen3-235b) have not been adjusted yet, so the switch is used to control it and prevent errors.
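
For readers following along, a minimal sketch of the before/after communication pattern described here (`dp_all_gather` and `gate` are placeholder names, not the vllm-ascend API):

```python
# Before: gate on local tokens, then two data-parallel all_gather calls.
def route_with_two_comms(hidden_states, gate, dp_all_gather):
    router_logits, _ = gate(hidden_states)               # gate on local tokens
    hidden_states = dp_all_gather(hidden_states, dim=0)  # communication 1
    router_logits = dp_all_gather(router_logits, dim=0)  # communication 2
    return hidden_states, router_logits

# After (rm_router_logits): a single all_gather, then gate on gathered tokens.
def route_with_one_comm(hidden_states, gate, dp_all_gather):
    hidden_states = dp_all_gather(hidden_states, dim=0)  # the only communication
    router_logits, _ = gate(hidden_states)               # gate after the gather
    return hidden_states, router_logits
```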

Collaborator

If it's not common, I'd prefer not to merge; we can wait a bit longer.

Or, if we can add more logic checks instead of an env var, I'm fine with that as well.

Contributor Author

OK. Currently rm_router_logits is enabled by default in the AllGather, AllGatherEP and NaiveMulticast scenarios of the DeepSeek model, and is not enabled in other scenarios or models. It can be added later if necessary.
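
A sketch of the kind of automatic check this implies; the enum below is a local stand-in for vllm-ascend's FusedMoEState and the helper name is hypothetical:

```python
from enum import Enum, auto

class FusedMoEState(Enum):
    # Local stand-in listing only the members mentioned in this thread.
    AllGather = auto()
    AllGatherEP = auto()
    NaiveMulticast = auto()
    Other = auto()

def should_rm_router_logits(state: FusedMoEState, is_deepseek: bool) -> bool:
    # Enable the one-communication routing path only for the DeepSeek
    # scenarios listed above; all other cases keep the original behaviour.
    return is_deepseek and state in (
        FusedMoEState.AllGather,
        FusedMoEState.AllGatherEP,
        FusedMoEState.NaiveMulticast,
    )
```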

}

# end-env-vars-definition
7 changes: 6 additions & 1 deletion vllm_ascend/models/deepseek_v2.py
@@ -67,6 +67,7 @@
maybe_prefix)
from vllm.sequence import IntermediateTensors

import vllm_ascend.envs as envs_ascend
from vllm_ascend.ascend_config import get_ascend_config
from vllm_ascend.distributed.parallel_state import get_ep_group
from vllm_ascend.ops.fused_moe import AscendFusedMoE
@@ -365,6 +366,7 @@ def __init__(
self.ep_group = get_ep_group()

self.params_dtype = torch.get_default_dtype()
self.rm_router_logits = envs_ascend.VLLM_ASCEND_RM_ROUTER_LOGITS

def forward(self,
hidden_states: torch.Tensor,
@@ -387,7 +389,9 @@ def forward(self,
is_prefill = is_prefill or attn_metadata.with_prefill_across_dp

# router_logits: (num_tokens, n_experts)
router_logits, _ = self.gate(hidden_states)
router_logits = None
if not self.rm_router_logits:
router_logits, _ = self.gate(hidden_states)

experts_hidden_states = self.experts(
hidden_states=hidden_states,
@@ -396,6 +400,7 @@
top_k=CustomDeepseekV2MoE.top_k,
enable_force_load_balance=enable_force_load_balance,
shared_experts=self.shared_experts,
gate=self.gate,
replace_allreduce=replace_allreduce)

hidden_states = (
16 changes: 12 additions & 4 deletions vllm_ascend/ops/fused_moe.py
@@ -1137,6 +1137,7 @@ def __init__(
self.activation = activation
self.log2phy = None
self.global_redundant_expert_num = 0
self.rm_router_logits = envs_ascend.VLLM_ASCEND_RM_ROUTER_LOGITS

ascend_config = get_ascend_config()
expert_map_path = ascend_config.expert_map_path
@@ -1212,7 +1213,9 @@ def forward(self,
enable_force_load_balance: bool = False,
top_k: Optional[int] = None,
shared_experts: Optional[Any] = None,
gate=None,
replace_allreduce: bool = False):

assert self.quant_method is not None

if top_k:
@@ -1257,11 +1260,16 @@
hidden_states = nn.functional.pad(
hidden_states,
(0, 0, 0, max_num_tokens_across_dp - num_tokens))
router_logits = nn.functional.pad(
router_logits,
(0, 0, 0, max_num_tokens_across_dp - num_tokens))
if not self.rm_router_logits:
router_logits = nn.functional.pad(
router_logits,
(0, 0, 0,
max_num_tokens_across_dp - num_tokens))
hidden_states = get_dp_group().all_gather(hidden_states, 0)
router_logits = get_dp_group().all_gather(router_logits, 0)
if self.rm_router_logits:
router_logits, _ = gate(hidden_states)
else:
router_logits = get_dp_group().all_gather(router_logits, 0)

# Matrix multiply.
e_hidden_states = self.quant_method.apply(