
Commit ee82a17

Introduce and use CustomDeepseekV2MergedReplicatedLinear
This is the replicated counterpart of MergedColumnParallelLinear, aimed at removing the TP communication of DeepSeek-V2's `gate_up_proj` linear. With the weight replicated, the chunked input hidden_states can also be consumed directly by the shared experts.

Signed-off-by: sdmyzlp <lrwei2@petalmail.com>
1 parent 3e74365 commit ee82a17
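
For context, here is a minimal standalone sketch (not part of this commit) of the narrow-and-copy loading that the new `CustomDeepseekV2MergedReplicatedLinear.weight_loader` below performs: each shard of the fused gate/up weight is copied whole into its slice of a replicated parameter, so no rank ever holds a TP-partitioned view. Shapes and helper names here are illustrative only.

```python
import torch

# Illustrative shapes only: two shards ("gate" and "up") fused along dim 0.
output_sizes = [4, 4]                              # e.g. [intermediate_size] * 2
fused_weight = torch.empty(sum(output_sizes), 8)   # replicated on every rank


def load_shard(fused: torch.Tensor, loaded: torch.Tensor, shard_id: int,
               output_sizes: list[int], output_dim: int = 0) -> None:
    # Offset and size of this shard inside the fused output dimension.
    offset = sum(output_sizes[:shard_id])
    size = output_sizes[shard_id]
    shard = fused.narrow(output_dim, offset, size)
    assert shard.size() == loaded.size()
    shard.copy_(loaded)                            # full shard copied, no TP split


load_shard(fused_weight, torch.randn(4, 8), shard_id=0, output_sizes=output_sizes)  # gate_proj
load_shard(fused_weight, torch.randn(4, 8), shard_id=1, output_sizes=output_sizes)  # up_proj
```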

File tree

  vllm_ascend/models/deepseek_v2.py
  vllm_ascend/ops/fused_moe.py
  vllm_ascend/quantization/w8a8_dynamic.py

3 files changed: +99 −58 lines

vllm_ascend/models/deepseek_v2.py

Lines changed: 74 additions & 32 deletions
@@ -98,6 +98,41 @@ def forward_oot(self, x: Union[torch.Tensor, Tuple[torch.Tensor,
         return super().forward_oot(x)


+class CustomDeepseekV2MergedReplicatedLinear(ReplicatedLinear):
+
+    def __init__(
+        self,
+        input_size: int,
+        output_sizes: list[int],
+        bias: bool = True,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        self.output_sizes = output_sizes
+        super().__init__(input_size,
+                         sum(output_sizes),
+                         bias=bias,
+                         quant_config=quant_config,
+                         prefix=prefix)
+
+    def weight_loader(self, param: torch.nn.Parameter,
+                      loaded_weight: torch.Tensor, loaded_shard_id: int):
+        # With no support for GGUF format yet.
+        assert not getattr(param, "is_gguf_weight", False)
+        assert not getattr(param, "is_gguf_weight_type", False)
+
+        assert loaded_shard_id < len(self.output_sizes)
+        shard_offset = sum(self.output_sizes[:loaded_shard_id])
+        shard_size = self.output_sizes[loaded_shard_id]
+        shard = param.data.narrow(param.output_dim, shard_offset, shard_size)
+
+        assert shard.size() == loaded_weight.size(), (
+            f"Tried to load weights of size {loaded_weight.size()}"
+            f"to a parameter shard of id {loaded_shard_id} size {shard.size()}"
+        )
+        shard.copy_(loaded_weight)
+
+
 class CustomDeepseekV2MLP(nn.Module):

     def __init__(
@@ -107,20 +142,33 @@ def __init__(
         hidden_act: str,
         quant_config: Optional[QuantizationConfig] = None,
         reduce_results: bool = True,
+        force_replicate: bool = False,
         prefix: str = "",
     ) -> None:
         super().__init__()
-        self.gate_up_proj = MergedColumnParallelLinear(
-            hidden_size, [intermediate_size] * 2,
-            bias=False,
-            quant_config=quant_config,
-            prefix=f"{prefix}.gate_up_proj")
-        self.down_proj = RowParallelLinear(intermediate_size,
-                                           hidden_size,
-                                           bias=False,
-                                           quant_config=quant_config,
-                                           reduce_results=reduce_results,
-                                           prefix=f"{prefix}.down_proj")
+        if not force_replicate:
+            self.gate_up_proj = MergedColumnParallelLinear(
+                hidden_size, [intermediate_size] * 2,
+                bias=False,
+                quant_config=quant_config,
+                prefix=f"{prefix}.gate_up_proj")
+            self.down_proj = RowParallelLinear(intermediate_size,
+                                               hidden_size,
+                                               bias=False,
+                                               quant_config=quant_config,
+                                               reduce_results=reduce_results,
+                                               prefix=f"{prefix}.down_proj")
+        else:
+            self.gate_up_proj = CustomDeepseekV2MergedReplicatedLinear(
+                hidden_size, [intermediate_size] * 2,
+                bias=False,
+                quant_config=quant_config,
+                prefix=f"{prefix}.gate_up_proj")
+            self.down_proj = ReplicatedLinear(intermediate_size,
+                                              hidden_size,
+                                              bias=False,
+                                              quant_config=quant_config,
+                                              prefix=f"{prefix}.down_proj")
         if hidden_act != "silu":
             raise ValueError(f"Unsupported activation: {hidden_act}. "
                              "Only silu is supported for now.")
@@ -181,6 +229,12 @@ def __init__(
             raise ValueError(f"Unsupported activation: {config.hidden_act}. "
                              "Only silu is supported for now.")

+        ascend_config = get_ascend_config()
+        self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled
+        # NOTE: multistream only effective when `VLLM_ENABLE_MC2` is on
+        self.enable_multistream_moe = \
+            ascend_config.torchair_graph_config.enable_multistream_moe and VLLM_ENABLE_MC2
+
         self.gate = ReplicatedLinear(config.hidden_size,
                                      config.n_routed_experts,
                                      bias=False,
@@ -216,6 +270,7 @@ def __init__(
                 hidden_act=config.hidden_act,
                 quant_config=quant_config,
                 reduce_results=True,
+                force_replicate=self.enable_multistream_moe,
                 prefix=f"{prefix}.shared_experts",
             )
         else:
@@ -230,12 +285,6 @@ def __init__(

         self.params_dtype = torch.get_default_dtype()

-        ascend_config = get_ascend_config()
-        self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled
-        # NOTE: multistream only effective when `VLLM_ENABLE_MC2` is on
-        self.enable_multistream_moe = \
-            ascend_config.torchair_graph_config.enable_multistream_moe and VLLM_ENABLE_MC2
-
     def forward(
         self,
         hidden_states: torch.Tensor,
@@ -274,27 +323,22 @@ def forward(
         # router_logits: (num_tokens, n_experts)
         router_logits, _ = self.gate(hidden_states)

-        kwargs = {}
-        if not use_separated_shared_experts:
-            kwargs.update({
-                "shared_experts": self.shared_experts,
-                "shared_experts_input": old_hidden_states
-            })
-
         experts_hidden_states = self.experts(
             hidden_states=hidden_states,
             router_logits=router_logits,
             is_prefill=is_prefill,
             top_k=CustomDeepseekV2MoE.top_k,
             enable_force_load_balance=enable_force_load_balance,
-            **kwargs)
+            shared_experts=(self.shared_experts
+                            if not use_separated_shared_experts else None),
+        )

         if not isinstance(experts_hidden_states, tuple):
             hidden_states = experts_hidden_states * self.routed_scaling_factor
         else:
-            hidden_states = experts_hidden_states[
-                0] * self.routed_scaling_factor
-            shared_hidden_states = experts_hidden_states[1]
+            hidden_states = (
+                experts_hidden_states[0] * self.routed_scaling_factor +
+                experts_hidden_states[1])

         if self.tp_size > 1:
             if (VLLM_ENABLE_MC2
@@ -309,10 +353,8 @@ def forward(
             hidden_states = tensor_model_parallel_all_reduce(hidden_states)

         if use_separated_shared_experts:
-            shared_hidden_states = self.shared_experts(old_hidden_states)
-
-        if self.shared_experts is not None:
-            hidden_states = hidden_states + shared_hidden_states
+            hidden_states = hidden_states + self.shared_experts(
+                old_hidden_states)

         return hidden_states.view(num_tokens, hidden_size)

vllm_ascend/ops/fused_moe.py

Lines changed: 6 additions & 7 deletions
@@ -16,7 +16,7 @@
 # Adapted from vllm/tests/kernels/test_moe.py

 import os
-from typing import Callable, List, Optional
+from typing import Any, Callable, List, Optional

 import torch
 import torch.distributed as dist
@@ -1099,8 +1099,8 @@ def forward(self,
                 router_logits: torch.Tensor,
                 is_prefill: bool,
                 enable_force_load_balance: bool = False,
-                top_k=None,
-                **kwargs):
+                top_k: Optional[int] = None,
+                shared_experts: Optional[Any] = None):
         assert self.quant_method is not None

         if top_k:
@@ -1147,14 +1147,13 @@ def forward(self,
             enable_force_load_balance=enable_force_load_balance,
             log2phy=self.log2phy,
             global_redundant_expert_num=self.global_redundant_expert_num,
-            **kwargs)
+            shared_experts=shared_experts,
+        )

-        shared_experts = kwargs.get("shared_experts", None)
-        shared_experts_input = kwargs.get("shared_experts_input", None)
         if shared_experts is not None:
             # Provide dummy implementation of "non-separated" shared experts.
             if not isinstance(e_hidden_states, tuple):
-                return e_hidden_states, shared_experts(shared_experts_input)
+                return e_hidden_states, shared_experts(hidden_states)
             else:
                 return e_hidden_states
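
Usage note: since `forward` now takes `shared_experts` explicitly instead of picking it out of `**kwargs`, the "non-separated" path simply pairs the routed output with the shared experts applied to the same (chunked) `hidden_states`. A toy sketch of that return contract, with placeholder names and under the assumption that the quant method returns a plain tensor when it has not already fused the shared experts:

```python
from typing import Optional, Tuple, Union

import torch
import torch.nn as nn


def moe_return_contract(
    e_hidden_states: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
    hidden_states: torch.Tensor,
    shared_experts: Optional[nn.Module] = None,
) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
    # Placeholder sketch of the branch above: if a shared-experts module was
    # passed in and only the routed tensor came back, run the shared experts
    # here on the same hidden_states and return a (routed, shared) pair.
    # The caller (CustomDeepseekV2MoE.forward) then combines them as
    #   routed * routed_scaling_factor + shared
    if shared_experts is not None and not isinstance(e_hidden_states, tuple):
        return e_hidden_states, shared_experts(hidden_states)
    return e_hidden_states
```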

vllm_ascend/quantization/w8a8_dynamic.py

Lines changed: 19 additions & 19 deletions
@@ -105,19 +105,21 @@ def apply_mlp(hidden_states: torch.Tensor,
     return hidden_states


-def fused_experts_with_mc2(hidden_states: torch.Tensor,
-                           w1: torch.Tensor,
-                           w2: torch.Tensor,
-                           w1_scale: torch.Tensor,
-                           w2_scale: torch.Tensor,
-                           topk_weights: torch.Tensor,
-                           topk_ids: torch.Tensor,
-                           top_k: int,
-                           expert_map: torch.Tensor = None,
-                           moe_all_to_all_group_name: str = "",
-                           log2phy: torch.Tensor = None,
-                           global_redundant_expert_num: int = 0,
-                           **kwargs) -> torch.Tensor:
+def fused_experts_with_mc2(
+    hidden_states: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    w1_scale: torch.Tensor,
+    w2_scale: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    top_k: int,
+    expert_map: torch.Tensor = None,
+    moe_all_to_all_group_name: str = "",
+    log2phy: torch.Tensor = None,
+    global_redundant_expert_num: int = 0,
+    shared_experts: Optional[Any] = None,
+) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:

     topk_ids = log2phy[topk_ids]
     global_bs = 0
@@ -161,13 +163,10 @@ def fused_experts_with_mc2(hidden_states: torch.Tensor,
     expand_x, dynamic_scale, expand_idx, expert_token_nums, ep_recv_counts = output[
         0:5]

-    shared_experts = kwargs.get("shared_experts", None)
-    shared_experts_input = kwargs.get("shared_experts_input", None)
     if shared_experts is not None:
         with npu_stream_switch("moe_secondary", 0):
-            npu_wait_tensor(shared_experts_input, topk_weights)
-            shared_gate_up, _ = shared_experts.gate_up_proj(
-                shared_experts_input)
+            npu_wait_tensor(hidden_states, topk_weights)
+            shared_gate_up, _ = shared_experts.gate_up_proj(hidden_states)
             npu_wait_tensor(shared_gate_up[0], expand_x)
             shared_act = shared_experts.act_fn(shared_gate_up)

@@ -615,6 +614,7 @@ def apply(
         enable_force_load_balance: bool = True,
         log2phy: torch.Tensor = None,
         global_redundant_expert_num: int = 0,
+        shared_experts: Optional[Any] = None,
         **kwargs,
     ) -> torch.Tensor:
         assert router_logits.shape[
@@ -671,7 +671,7 @@ def apply(
                 moe_all_to_all_group_name=self.moe_all_to_all_group_name,
                 log2phy=log2phy,
                 global_redundant_expert_num=global_redundant_expert_num,
-                **kwargs)
+                shared_experts=shared_experts)
         elif self.torchair_graph_enabled or self.ep_group.world_size == 1:
             return fused_experts(hidden_states=x,
                                  w1=layer.w13_weight,
