
Commit cdae151

avoid performing index selection of the sin/cos cache in every layer
Signed-off-by: whx-sjtu <2952154980@qq.com>
1 parent e878d56 commit cdae151
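
In outline, the commit hoists the rope cos/sin index selection out of every attention layer: the model-level forward gathers cos/sin for the current positions once per batch, stashes the result on attn_metadata.decode / attn_metadata.prefill, and each MLA layer simply reads the precomputed tensors back (alongside a running_in_graph flag that is now set once on the forward context). The following is a minimal, runnable sketch of that pattern; the class and attribute names (RopeCache, DecodeMeta, index_select_rope) are illustrative stand-ins, not the vllm-ascend API.

import torch

# Toy stand-ins for the rotary-embedding cache and the per-batch decode metadata.
class RopeCache:
    def __init__(self, max_pos: int = 4096, rot_dim: int = 64):
        self.cos_cached = torch.randn(max_pos, rot_dim)
        self.sin_cached = torch.randn(max_pos, rot_dim)

class DecodeMeta:
    def __init__(self, input_positions: torch.Tensor):
        self.input_positions = input_positions
        self.cos = None  # filled once per forward pass after this change
        self.sin = None

def index_select_rope(cache, positions, dtype):
    # One gather for cos and one for sin, reshaped to broadcast over heads,
    # mirroring the cos[:, None, None, :] layout used in the diff below.
    cos = cache.cos_cached.to(dtype)[positions][:, None, None, :]
    sin = cache.sin_cached.to(dtype)[positions][:, None, None, :]
    return cos, sin

cache = RopeCache()
meta = DecodeMeta(input_positions=torch.tensor([5, 9, 42]))

# Before: each attention layer repeated index_select_rope() in its forward.
# After: the model-level forward runs it once and every layer reuses the result.
meta.cos, meta.sin = index_select_rope(cache, meta.input_positions, torch.float16)
for _ in range(3):  # stands in for the loop over decoder layers
    cos, sin = meta.cos, meta.sin  # cheap attribute reads, no per-layer IndexByTensor
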

3 files changed: +41, -21 lines


vllm_ascend/ascend_forward_context.py

Lines changed: 11 additions & 0 deletions

@@ -7,6 +7,8 @@
 from vllm.distributed import get_dp_group
 from vllm.forward_context import get_forward_context, set_forward_context
 
+from vllm_ascend.ascend_config import get_ascend_config
+
 
 class FusedMoEState(Enum):
     AllGather = 0
@@ -55,6 +57,15 @@ def set_ascend_forward_context(
 
         forward_context.in_profile_run = in_profile_run
 
+        ascend_config = get_ascend_config()
+        from vllm_ascend.attention.attention_v1 import AscendAttentionState
+        forward_context.running_in_graph = ascend_config.torchair_graph_config.enabled and \
+            attn_metadata and \
+            attn_metadata.attn_state in [
+                AscendAttentionState.DecodeOnly,
+                AscendAttentionState.SpecDecoding
+            ]
+
         dp_world_size = get_dp_group().world_size
         if dp_world_size > 1 and forward_context.dp_metadata is not None:
             forward_context.max_tokens_across_dp = forward_context.dp_metadata.max_tokens_across_dp_cpu.item(

vllm_ascend/attention/mla_v1.py

Lines changed: 6 additions & 21 deletions

@@ -9,6 +9,7 @@
                                               MLAAttentionImpl)
 from vllm.attention.backends.utils import PAD_SLOT_ID
 from vllm.config import get_current_vllm_config
+from vllm.forward_context import get_forward_context
 from vllm.model_executor.layers.linear import (LinearBase,
                                                UnquantizedLinearMethod)
 from vllm.utils import cdiv, round_down
@@ -1042,9 +1043,7 @@ def forward(
         if attn_metadata is None:
             # Profiling run.
             return output
-        self.running_in_graph = self.torchair_graph_enabled and attn_metadata.attn_state in [
-            AscendAttentionState.DecodeOnly, AscendAttentionState.SpecDecoding
-        ]
+        self.running_in_graph = get_forward_context().running_in_graph
         num_actual_toks = attn_metadata.num_actual_tokens
         if k_pe is None and not self.running_in_graph:
             kv_c, k_pe = self.kv_a_proj_with_mqa(
@@ -1082,15 +1081,8 @@ def forward(
             decode_k_nope = None
             assert attn_metadata.decode is not None
             if self.running_in_graph:
-                seq_len = self.rotary_emb.max_position_embeddings * self.rotary_emb.scaling_factor
-                cos = self.rotary_emb.cos_cached[:seq_len].to(
-                    dtype=decode_hs_or_q_c.dtype)
-                sin = self.rotary_emb.sin_cached[:seq_len].to(
-                    dtype=decode_hs_or_q_c.dtype)
-                cos = cos[attn_metadata.decode.input_positions]
-                sin = sin[attn_metadata.decode.input_positions]
-                cos = cos[:, None, None, :]
-                sin = sin[:, None, None, :]
+                cos = attn_metadata.decode.cos
+                sin = attn_metadata.decode.sin
                 # Without explicitly controlling the order, IndexByTensor operations
                 # would be placed after `matmul W_KV_T` hindering the overlapping of
                 # KvRmsNormRopeCache and SingleRope.
@@ -1125,15 +1117,8 @@ def forward(
             prefill_q_nope = prefill_q[..., :self.qk_nope_head_dim]
             if self.torchair_graph_enabled:
                 num_tokens = prefill_hs_or_q_c.shape[0]
-                seq_len = self.rotary_emb.max_position_embeddings * self.rotary_emb.scaling_factor
-                cos = self.rotary_emb.cos_cached[:seq_len].to(
-                    dtype=prefill_q_pe.dtype)
-                sin = self.rotary_emb.sin_cached[:seq_len].to(
-                    dtype=prefill_q_pe.dtype)
-                cos = cos[attn_metadata.prefill.input_positions]
-                sin = sin[attn_metadata.prefill.input_positions]
-                cos = cos[:, None, None, :]
-                sin = sin[:, None, None, :]
+                cos = attn_metadata.prefill.cos
+                sin = attn_metadata.prefill.sin
 
                 prefill_q_pe = self.rope_single(prefill_q_pe, cos, sin)
                 prefill_k_pe, prefill_k_nope = self.exec_kv_prefill(

vllm_ascend/models/deepseek_v2.py

Lines changed: 24 additions & 0 deletions

@@ -676,6 +676,12 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.make_empty_intermediate_tensors = (
             make_empty_intermediate_tensors_factory(
                 ["hidden_states", "residual"], config.hidden_size))
+        ascend_config = get_ascend_config()
+        self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled
+        self.cos_cached = self.layers[
+            self.start_layer].self_attn.rotary_emb.cos_cached
+        self.sin_cached = self.layers[
+            self.start_layer].self_attn.rotary_emb.sin_cached
 
     def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)
@@ -700,6 +706,24 @@ def forward(
             hidden_states = intermediate_tensors["hidden_states"]
             residual = intermediate_tensors["residual"]
 
+        forward_context = get_forward_context()
+        # Index select sin/cos for rope here.
+        if attn_metadata is not None:
+            if attn_metadata.num_decodes > 0 and forward_context.running_in_graph:
+                cos = self.cos_cached.to(dtype=hidden_states.dtype)
+                sin = self.sin_cached.to(dtype=hidden_states.dtype)
+                cos = cos[attn_metadata.decode.input_positions]
+                sin = sin[attn_metadata.decode.input_positions]
+                attn_metadata.decode.cos = cos[:, None, None, :]
+                attn_metadata.decode.sin = sin[:, None, None, :]
+            if attn_metadata.num_prefills > 0 and self.torchair_graph_enabled:
+                cos = self.cos_cached.to(dtype=hidden_states.dtype)
+                sin = self.sin_cached.to(dtype=hidden_states.dtype)
+                cos = cos[attn_metadata.prefill.input_positions]
+                sin = sin[attn_metadata.prefill.input_positions]
+                attn_metadata.prefill.cos = cos[:, None, None, :]
+                attn_metadata.prefill.sin = sin[:, None, None, :]
+
         for i in range(self.start_layer, self.end_layer):
             layer = self.layers[i]
             hidden_states, residual = layer(
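
For a rough sense of what the hoist saves per forward pass, assume a hypothetical 60-layer DeepSeek-V2 stack (the real depth comes from the model config): the cos/sin gathers drop from one pair inside every layer to a single pair computed in the model forward for each decode/prefill branch.

num_hidden_layers = 60  # hypothetical layer count; the real value comes from the model config
gathers_before = num_hidden_layers * 2  # cos + sin re-indexed inside every layer's forward
gathers_after = 2                       # cos + sin indexed once in the model forward
print(gathers_before, "->", gathers_after)  # 120 -> 2
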
