
Commit 8de805a

add mc2 mask
Signed-off-by: weiguihua2 <weiguihua2@huawei.com>
1 parent 88c31da commit 8de805a

4 files changed: +77 -33 lines changed

vllm_ascend/attention/mla_v1.py

Lines changed: 64 additions & 29 deletions
@@ -82,6 +82,8 @@ class ChunkedContextMetadata:
     max_query_len: int
     max_seq_lens: int
     chunked_context: Optional[ChunkedContextMetadata] = None
+    sin: torch.Tensor = None
+    cos: torch.Tensor = None
 
 
 @dataclass
@@ -95,6 +97,8 @@ class AscendMLADecodeMetadata:
     seq_lens_list: list[int]
     actual_seq_q_lens: Optional[list[int]] = None
     attn_mask: Optional[torch.Tensor] = None
+    sin: torch.Tensor = None
+    cos: torch.Tensor = None
     mc2_mask: Optional[torch.Tensor] = None
 
 
@@ -207,9 +211,14 @@ def __init__(self,
         )
         ascend_config = get_ascend_config()
         self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled
-
-    def generate_active_mask(self, actual_seqs_num, batch_size):
-        mc2_mask = torch.zeros(batch_size, dtype=torch.bool, device=current_platform.device_type)
+        self.rope_dim = self.runner.model_config.hf_text_config.qk_rope_head_dim
+        self.cos_cache = None
+        self.sin_cache = None
+
+    def generate_activate_mask(self, actual_seqs_num, batch_size):
+        mc2_mask = torch.zeros(batch_size,
+                               dtype=torch.bool,
+                               device=current_platform.device_type)
         mc2_mask[:actual_seqs_num].fill_(True)
         return mc2_mask
 
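
For reference, the renamed helper simply builds a boolean mask over the (possibly padded) batch, marking which slots hold real requests. A minimal, device-agnostic sketch of the same logic (the actual method allocates on current_platform.device_type):

import torch

def generate_activate_mask(actual_seqs_num: int, batch_size: int) -> torch.Tensor:
    # True for the first `actual_seqs_num` slots (real requests),
    # False for any padding slots appended to reach the graph batch size.
    mc2_mask = torch.zeros(batch_size, dtype=torch.bool)
    mc2_mask[:actual_seqs_num].fill_(True)
    return mc2_mask

# 3 real requests padded to a batch of 8:
# tensor([ True,  True,  True, False, False, False, False, False])
print(generate_activate_mask(3, 8))
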
@@ -343,7 +352,19 @@ def build_torchair_graph_dummy(
         else:
             attn_state = AscendAttentionState.DecodeOnly
             num_decode_tokens = 1
-        mc2_mask = self.generate_active_mask(num_actual_tokens, num_reqs)
+        sin = torch.ones(num_reqs,
+                         1,
+                         1,
+                         self.rope_dim,
+                         dtype=self.runner.dtype,
+                         device=device)
+        cos = torch.ones(num_reqs,
+                         1,
+                         1,
+                         self.rope_dim,
+                         dtype=self.runner.dtype,
+                         device=device)
+        mc2_mask = self.generate_activate_mask(num_actual_tokens, num_reqs)
         decode_metadata = AscendMLADecodeMetadata(
             input_positions=input_positions,
             block_table=block_table,
@@ -352,8 +373,9 @@ def build_torchair_graph_dummy(
             max_seq_lens=1,
             attn_mask=self.runner.spec_attn_mask,
             actual_seq_q_lens=self.runner.actual_seq_q_lens[:num_reqs],
-            mc2_mask=mc2_mask,
-        )
+            sin=sin,
+            cos=cos,
+            mc2_mask=mc2_mask)
         return self.metadata_cls(  # type: ignore
             num_input_tokens=num_actual_tokens,
             num_actual_tokens=num_actual_tokens,
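
The dummy decode metadata used for torchair graph capture now also carries placeholder sin/cos tensors shaped [num_reqs, 1, 1, rope_dim], matching what the real decode path will later provide. A rough, shape-only sketch with made-up sizes (the builder uses self.runner.dtype and the NPU device):

import torch

num_reqs, rope_dim = 4, 64               # placeholder sizes
dtype, device = torch.float16, "cpu"     # stand-ins for self.runner.dtype / the NPU device

# All-ones placeholders: one rotary position per request, broadcastable over heads.
sin = torch.ones(num_reqs, 1, 1, rope_dim, dtype=dtype, device=device)
cos = torch.ones(num_reqs, 1, 1, rope_dim, dtype=dtype, device=device)
assert sin.shape == cos.shape == (num_reqs, 1, 1, rope_dim)
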
@@ -405,6 +427,16 @@ def build(
         max_query_len = query_lens.max().item()
         max_seq_lens = seq_lens.max().item()
         query_start_loc = common_attn_metadata.query_start_loc
+        if self.cos_cache is None:
+            self.cos_cache = self.runner.get_model(
+            ).model.layers[0].self_attn.rotary_emb.cos_cached
+            self.sin_cache = self.runner.get_model(
+            ).model.layers[0].self_attn.rotary_emb.sin_cached
+        if self.cos_cache.dtype != self.runner.dtype:  # type: ignore
+            self.cos_cache = self.cos_cache.to(  # type: ignore
+                self.runner.dtype)  # type: ignore
+            self.sin_cache = self.sin_cache.to(  # type: ignore
+                self.runner.dtype)  # type: ignore
 
         prefill_metadata = None
         chunked_context_metadata = None
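
build() now fetches the rotary embedding's cos/sin tables once from the first decoder layer and converts them to the runner dtype up front, instead of slicing and casting inside every forward call. A hedged sketch of that lazy-init pattern; the attribute path mirrors the diff, while _ensure_rope_cache is only a hypothetical name for the inlined logic:

def _ensure_rope_cache(self):
    # Grab the full cos/sin tables once from layer 0's rotary embedding.
    if self.cos_cache is None:
        rotary_emb = self.runner.get_model().model.layers[0].self_attn.rotary_emb
        self.cos_cache = rotary_emb.cos_cached
        self.sin_cache = rotary_emb.sin_cached
    # Cast once so later per-token indexing needs no dtype conversion.
    if self.cos_cache.dtype != self.runner.dtype:
        self.cos_cache = self.cos_cache.to(self.runner.dtype)
        self.sin_cache = self.sin_cache.to(self.runner.dtype)
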
@@ -451,18 +483,26 @@ def build(
                     chunk_seq_lens=chunk_seq_lens,
                     workspace=self.chunked_prefill_workspace,
                 )
-
+            prefill_input_positions = input_positions[tokens_start:]
+            cos = self.cos_cache[
+                prefill_input_positions].unsqueeze(  # type: ignore
+                    1).unsqueeze(2)
+            sin = self.sin_cache[
+                prefill_input_positions].unsqueeze(  # type: ignore
+                    1).unsqueeze(2)
             prefill_metadata = AscendMLAPrefillMetadata(
                 attn_mask=self.runner.attn_mask,
                 query_lens=query_lens[tokens_start:],
                 seq_lens=seq_lens,
                 context_lens=seq_lens[tokens_start:],
-                input_positions=input_positions[tokens_start:],
+                input_positions=prefill_input_positions,
                 block_table=block_table[reqs_start:, ...],
                 max_query_len=max_query_len,
                 max_seq_lens=max_seq_lens,
                 query_start_loc=prefill_query_start_loc,
                 chunked_context=chunked_context_metadata,
+                sin=sin,
+                cos=cos,
             )
 
         decode_metadata = None
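
The prefill branch gathers per-token cos/sin rows from the cached tables by input position and then adds two singleton dimensions, so the result broadcasts over the head axes when rotary embedding is applied. A standalone example of that gather-and-reshape with a random stand-in table:

import torch

rope_dim, max_pos, num_tokens = 64, 1024, 5
cos_cache = torch.randn(max_pos, rope_dim)          # stands in for self.cos_cache
input_positions = torch.randint(0, max_pos, (num_tokens,))

# Index by position, then unsqueeze twice -> [num_tokens, 1, 1, rope_dim],
# which broadcasts against queries of shape [num_tokens, num_heads, 1, rope_dim].
cos = cos_cache[input_positions].unsqueeze(1).unsqueeze(2)
print(cos.shape)  # torch.Size([5, 1, 1, 64])
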
@@ -507,9 +547,17 @@ def build(
                 actual_seq_q_lens = query_start_loc[1:].tolist(
                 ) + self.runner.actual_seq_q_lens[num_reqs:num_reqs +
                                                   num_reqs_pad_size]
+                cos = self.cos_cache[
+                    input_positions].unsqueeze(  # type: ignore
+                        1).unsqueeze(2)
+                sin = self.sin_cache[
+                    input_positions].unsqueeze(  # type: ignore
+                        1).unsqueeze(2)
             else:
                 seq_lens_list = seq_lens.tolist()
-            mc2_mask = self.generate_active_mask(num_actual_tokens, num_reqs)
+                cos, sin = None, None
+            mc2_mask = self.generate_activate_mask(
+                num_actual_tokens, num_reqs + num_reqs_pad_size)
 
             decode_metadata = AscendMLADecodeMetadata(
                 input_positions=input_positions,
@@ -519,8 +567,9 @@ def build(
                 max_seq_lens=max_seq_lens,
                 attn_mask=self.runner.spec_attn_mask,
                 actual_seq_q_lens=actual_seq_q_lens,
-                mc2_mask=mc2_mask,
-            )
+                sin=sin,
+                cos=cos,
+                mc2_mask=mc2_mask)
 
         return self.metadata_cls(  # type: ignore
             num_actual_tokens=num_actual_tokens,
@@ -1112,15 +1161,8 @@ def forward(
             decode_k_nope = None
             assert attn_metadata.decode is not None
             if self.running_in_graph:
-                seq_len = self.rotary_emb.max_position_embeddings * self.rotary_emb.scaling_factor
-                cos = self.rotary_emb.cos_cached[:seq_len].to(
-                    dtype=decode_hs_or_q_c.dtype)
-                sin = self.rotary_emb.sin_cached[:seq_len].to(
-                    dtype=decode_hs_or_q_c.dtype)
-                cos = cos[attn_metadata.decode.input_positions]
-                sin = sin[attn_metadata.decode.input_positions]
-                cos = cos[:, None, None, :]
-                sin = sin[:, None, None, :]
+                cos = attn_metadata.decode.cos
+                sin = attn_metadata.decode.sin
                 # Without explicitly controlling the order, IndexByTensor operations
                 # would be placed after `matmul W_KV_T` hindering the overlapping of
                 # KvRmsNormRopeCache and SingleRope.
@@ -1155,15 +1197,8 @@ def forward(
             prefill_q_nope = prefill_q[..., :self.qk_nope_head_dim]
             if self.torchair_graph_enabled:
                 num_tokens = prefill_hs_or_q_c.shape[0]
-                seq_len = self.rotary_emb.max_position_embeddings * self.rotary_emb.scaling_factor
-                cos = self.rotary_emb.cos_cached[:seq_len].to(
-                    dtype=prefill_q_pe.dtype)
-                sin = self.rotary_emb.sin_cached[:seq_len].to(
-                    dtype=prefill_q_pe.dtype)
-                cos = cos[attn_metadata.prefill.input_positions]
-                sin = sin[attn_metadata.prefill.input_positions]
-                cos = cos[:, None, None, :]
-                sin = sin[:, None, None, :]
+                cos = attn_metadata.prefill.cos
+                sin = attn_metadata.prefill.sin
 
                 prefill_q_pe = self.rope_single(prefill_q_pe, cos, sin)
                 prefill_k_pe, prefill_k_nope = self.exec_kv_prefill(
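
With cos/sin now precomputed by the metadata builder, forward() no longer rebuilds them from rotary_emb.cos_cached/sin_cached on every step; it just reads attn_metadata.decode.cos/sin (or the prefill equivalents) and hands them to rope_single. The actual kernel is not part of this diff, so the following is only a generic rotate-half sketch of how [num_tokens, 1, 1, rope_dim] cos/sin tensors are typically consumed:

import torch

def apply_rope(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
    # x: [num_tokens, num_heads, 1, rope_dim]; cos/sin: [num_tokens, 1, 1, rope_dim]
    x1, x2 = x.chunk(2, dim=-1)
    rotated = torch.cat((-x2, x1), dim=-1)
    return x * cos + rotated * sin

q_pe = torch.randn(5, 16, 1, 64)
cos = torch.ones(5, 1, 1, 64)
sin = torch.zeros(5, 1, 1, 64)
assert torch.allclose(apply_rope(q_pe, cos, sin), q_pe)  # cos=1, sin=0 is the identity
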

vllm_ascend/ops/fused_moe.py

Lines changed: 7 additions & 2 deletions
@@ -138,7 +138,7 @@ def fused_experts_with_mc2(
     # NOTE: Currently, when in A3 or in torchair graph, we need to pass in some extra param into dispatch & combine
     need_extra_args = (get_ascend_soc_version() == AscendSocVersion.A3
                        or is_torchair)
-
+
     # NOTE: Currently, when in A3, we need to pass in some extra param into dispatch & combine
     a3_need_extra_args = get_ascend_soc_version() == AscendSocVersion.A3
 
@@ -1168,7 +1168,7 @@ def forward(self,
         if shared_experts:
             if not self.enable_multistream_moe or fused_moe_state != FusedMoEState.MC2:
                 shared_hidden_states = shared_experts(hidden_states)
-
+
         attn_metadata = get_forward_context().attn_metadata
         mc2_mask = attn_metadata.decode.mc2_mask if attn_metadata is not None and attn_metadata.decode is not None else None
 
@@ -1179,6 +1179,9 @@ def forward(self,
                     hidden_states, (0, 0, 0, tp_size - num_tokens))
                 router_logits = nn.functional.pad(
                     router_logits, (0, 0, 0, tp_size - num_tokens))
+                if mc2_mask is not None:
+                    mc2_mask = nn.functional.pad(mc2_mask,
+                                                 (0, tp_size - num_tokens))
             chunk_hidden_states = torch.tensor_split(hidden_states,
                                                      tp_size,
                                                      dim=0)
@@ -1188,9 +1191,11 @@ def forward(self,
             tp_rank = get_tensor_model_parallel_rank()
             hidden_states = chunk_hidden_states[tp_rank]
             router_logits = chunk_router_logits[tp_rank]
+
             if mc2_mask is not None:
                 chunk_mc2_mask = torch.tensor_split(mc2_mask, tp_size, dim=0)
                 mc2_mask = chunk_mc2_mask[tp_rank]
+
         if self.dp_size > 1 and fused_moe_state == FusedMoEState.AllGather:
             # NOTE: When in torchair graph, it has been padded in model_runner_v1
             if not self.torchair_graph_enabled or is_prefill:
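
When the MC2 path pads hidden_states and router_logits up to tp_size rows, the new code pads and splits mc2_mask the same way, so each tensor-parallel rank's mask chunk stays aligned with its token chunk. A self-contained sketch of that bookkeeping, with tp_size and tp_rank as placeholders for the distributed helpers used in this file:

import torch
import torch.nn.functional as F

tp_size, tp_rank = 4, 1                     # placeholders for the TP group helpers
num_tokens = 3
hidden_states = torch.randn(num_tokens, 16)
mc2_mask = torch.tensor([True, True, False])

if num_tokens < tp_size:
    # Pad rows up to tp_size; the mask is padded with False so padded rows stay inactive.
    hidden_states = F.pad(hidden_states, (0, 0, 0, tp_size - num_tokens))
    mc2_mask = F.pad(mc2_mask, (0, tp_size - num_tokens))

# Each rank takes the matching chunk so mask entries line up with its tokens.
hidden_states = torch.tensor_split(hidden_states, tp_size, dim=0)[tp_rank]
mc2_mask = torch.tensor_split(mc2_mask, tp_size, dim=0)[tp_rank]
print(hidden_states.shape, mc2_mask)  # torch.Size([1, 16]) tensor([True])
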

vllm_ascend/quantization/w8a8_dynamic.py

Lines changed: 4 additions & 2 deletions
@@ -233,7 +233,7 @@ def fused_experts_with_mc2(
     # NOTE: Currently, when in A3 or in torchair graph, we need to pass in some extra param into dispatch & combine
     need_extra_args = (get_ascend_soc_version() == AscendSocVersion.A3
                        or is_torchair)
-
+
     # NOTE: Currently, when in A3, we need to pass in some extra param into dispatch & combine
     a3_need_extra_args = get_ascend_soc_version() == AscendSocVersion.A3
 
@@ -803,6 +803,7 @@ def apply(
         topk_weights = topk_weights.to(x.dtype)
 
         if fused_moe_state == FusedMoEState.MC2:
+            mc2_mask = kwargs.get("mc2_mask", None)
             return fused_experts_with_mc2(
                 hidden_states=x,
                 w1=layer.w13_weight,
@@ -819,7 +820,8 @@ def apply(
                 shared_experts=shared_experts,
                 is_torchair=self.torchair_graph_enabled,
                 quantized_x_for_share=shared_gate_up,
-                dynamic_scale_for_share=shared_dequant_scale)
+                dynamic_scale_for_share=shared_dequant_scale,
+                mc2_mask=mc2_mask)
         elif fused_moe_state == FusedMoEState.AllGather:
             return fused_experts(hidden_states=x,
                                  w1=layer.w13_weight,
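
On the quantized path, apply() now pulls the mask out of kwargs with kwargs.get("mc2_mask", None) and forwards it to fused_experts_with_mc2, so callers that never pass a mask keep working unchanged. A minimal illustration of that optional-kwarg forwarding, where dispatch_mc2 is only a hypothetical stand-in for the real kernel wrapper:

from typing import Optional

import torch

def dispatch_mc2(x: torch.Tensor, mc2_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
    # When a mask is provided, zero out rows that correspond to padding tokens.
    return x if mc2_mask is None else x * mc2_mask.unsqueeze(-1)

def apply(x: torch.Tensor, **kwargs) -> torch.Tensor:
    # Absent key -> None, so older call sites without mc2_mask still work.
    mc2_mask = kwargs.get("mc2_mask", None)
    return dispatch_mc2(x, mc2_mask=mc2_mask)

out = apply(torch.ones(2, 4), mc2_mask=torch.tensor([True, False]))
print(out)  # second row is zeroed out
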

vllm_ascend/worker/model_runner_v1.py

Lines changed: 2 additions & 0 deletions
@@ -1666,6 +1666,8 @@ def _dummy_run(
                         attn_metadata.decode.block_table)
                     torch._dynamo.mark_static(
                         attn_metadata.decode.input_positions)
+                    torch._dynamo.mark_static(attn_metadata.decode.sin)
+                    torch._dynamo.mark_static(attn_metadata.decode.cos)
                     torch._dynamo.mark_static(
                         attn_metadata.decode.mc2_mask)
                     torch._dynamo.mark_static(attn_metadata.slot_mapping)
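
The new decode.sin and decode.cos tensors are registered with torch._dynamo.mark_static alongside the other decode metadata, so graph capture treats their shapes as fixed rather than dynamic. A tiny example of the call, with toy tensors standing in for the metadata fields:

import torch

sin = torch.ones(4, 1, 1, 64)   # stand-in for attn_metadata.decode.sin
cos = torch.ones(4, 1, 1, 64)   # stand-in for attn_metadata.decode.cos

# Mark the tensors' dimensions as static so dynamo does not treat them as
# dynamic shapes when the decode graph is compiled.
torch._dynamo.mark_static(sin)
torch._dynamo.mark_static(cos)
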
