Commit ac998d5

MengqingCao authored and yangcheng (AJ) committed
[MLA][Graph] Improve assertion on Graph mode with MLA (vllm-project#933)
Improve assertion on Graph mode with MLA. When running DeepSeek with graph mode, the fused MLA op only supports `numHeads / numKvHeads ∈ {32, 64, 128}`, so the assertion message is improved to avoid confusing users. Adjusting the tensor-parallel size is required when running DeepSeek-V3/R1 with graph mode; DeepSeek-V2-Lite is not supported in graph mode.

Tested locally, as the CI machine cannot run V3 due to the HBM limits.

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
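To make the constraint concrete, below is a minimal sketch (not part of this commit) of which tensor-parallel sizes satisfy it. It assumes MLA leaves a single KV head per rank, so num_queries_per_kv equals the per-rank query-head count, i.e. total attention heads divided by the TP size; the head counts (128 for DeepSeek-V3/R1, 16 for DeepSeek-V2-Lite) come from the models' public configs, and the helper name graph_mode_tp_ok is made up for illustration.

    # Sketch only: which TP sizes the fused MLA op accepts in graph mode.
    # Assumption: with MLA there is effectively one KV head per rank, so
    # num_queries_per_kv == num_attention_heads // tp_size.
    ALLOWED_NUM_QUERIES_PER_KV = (32, 64, 128)

    def graph_mode_tp_ok(num_attention_heads: int, tp_size: int) -> bool:
        """Return True if this head count / TP split passes the new assertion."""
        if num_attention_heads % tp_size != 0:
            return False
        return num_attention_heads // tp_size in ALLOWED_NUM_QUERIES_PER_KV

    if __name__ == "__main__":
        # DeepSeek-V3 / R1: 128 attention heads -> TP 1, 2, 4 pass; TP 8 gives 16 and fails.
        for tp in (1, 2, 4, 8):
            print(f"DeepSeek-V3/R1, tp={tp}: {graph_mode_tp_ok(128, tp)}")
        # DeepSeek-V2-Lite: 16 attention heads -> no TP split can reach 32.
        print(f"DeepSeek-V2-Lite, tp=1: {graph_mode_tp_ok(16, 1)}")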
1 parent 16058dc commit ac998d5

2 files changed: +21 −0 lines changed

vllm_ascend/attention/attention.py

Lines changed: 11 additions & 0 deletions

@@ -40,6 +40,8 @@
 from vllm_ascend.worker.model_runner import (
     ModelInputForNPUBuilder, ModelInputForNPUWithSamplingMetadata)
 
+_ALLOWED_NUM_QUERIES_PER_KV = [32, 64, 128]
+
 
 def generate_attn_mask(max_seq_len: int, dtype=torch.float16, mask_value=None):
     # Construct lower triangle matrix.

@@ -1005,6 +1007,15 @@ def __init__(
         ascend_config = get_ascend_config()
         self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled
 
+        # TODO: support numHeads / numKvHeads < 16 in MLA kernel
+        if self.torchair_graph_enabled:
+            assert self.num_queries_per_kv in _ALLOWED_NUM_QUERIES_PER_KV, \
+                ("The allowed number of queries per kv when enabling both MLA and Graph mode"
+                 " is {32, 64, 128}; thus DeepSeek-V2-Lite is not supported, as it only has"
+                 " 16 attention heads. If you're using DeepSeek-V3 or DeepSeek-R1, please make"
+                 " sure num_heads / num_kv_heads is in {32, 64, 128} after the tensor parallel"
+                 " split.")
+
     def exec_kv(
         self,
         hidden_states: torch.Tensor,

vllm_ascend/attention/mla_v1.py

Lines changed: 10 additions & 0 deletions

@@ -15,6 +15,7 @@
 
 from vllm_ascend import envs
 from vllm_ascend.ascend_config import get_ascend_config
+from vllm_ascend.attention.attention import _ALLOWED_NUM_QUERIES_PER_KV
 from vllm_ascend.attention.attention_v1 import AscendAttentionState
 from vllm_ascend.attention.utils import \
     AscendCommonAttentionMetadata as CommonAttentionMetadata

@@ -585,6 +586,15 @@ def __init__(
             self.spec_token_num = speculative_config.num_speculative_tokens
             assert self.spec_token_num > 0
 
+        # TODO: support numHeads / numKvHeads < 16 in MLA kernel
+        if self.torchair_graph_enabled:
+            assert self.num_queries_per_kv in _ALLOWED_NUM_QUERIES_PER_KV, \
+                ("The allowed number of queries per kv when enabling both MLA and Graph mode"
+                 " is {32, 64, 128}; thus DeepSeek-V2-Lite is not supported, as it only has"
+                 " 16 attention heads. If you're using DeepSeek-V3 or DeepSeek-R1, please make"
+                 " sure num_heads / num_kv_heads is in {32, 64, 128} after the tensor parallel"
+                 " split.")
+
     def _v_up_proj_and_o_proj(self, x):
         # Convert from (B, N, L) to (N, B, L)
         x = x.view(-1, self.num_heads, self.kv_lora_rank).transpose(0, 1)
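As a follow-up on the commit message's note that the tensor-parallel size must be adjusted, here is a hedged launch sketch for DeepSeek-R1 with graph mode. The torchair_graph_config.enabled knob is taken from this diff; wrapping it in additional_config and passing it to vLLM's LLM constructor reflects a common vllm-ascend usage pattern but is an assumption here, as is the choice of tensor_parallel_size=4 (128 heads / 4 = 32, which satisfies the check; memory sizing is out of scope).

    # Sketch under the assumptions stated above; the exact config surface may differ by version.
    from vllm import LLM, SamplingParams

    llm = LLM(
        model="deepseek-ai/DeepSeek-R1",   # 128 attention heads
        tensor_parallel_size=4,            # 128 / 4 = 32, within {32, 64, 128}
        additional_config={
            # Key name mirrors ascend_config.torchair_graph_config.enabled from this diff;
            # the additional_config wrapping is assumed, not taken from the commit.
            "torchair_graph_config": {"enabled": True},
        },
    )

    print(llm.generate(["Hello"], SamplingParams(max_tokens=8)))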
