Commit 5a8c482

[MLA][Graph] Improve assertion on Graph mode with MLA

Signed-off-by: MengqingCao <cmq0113@163.com>
1 parent: 7aa4f85

2 files changed: +11 -1

vllm_ascend/attention/attention.py (10 additions, 0 deletions)

@@ -947,6 +947,9 @@ def forward(
         return output.view(num_tokens, self.hidden_size)
 
 
+ALLOWED_NUM_QUERIES_PER_KV = [32, 64, 128]
+
+
 class AscendMLAAttentionBackendImpl(MLAAttentionImpl):
 
     def __init__(
@@ -1005,6 +1008,13 @@ def __init__(
         if additional_config:
             self.enable_graph_mode = additional_config.get(
                 "enable_graph_mode", False)
+            if self.enable_graph_mode:
+                assert self.num_queries_per_kv in ALLOWED_NUM_QUERIES_PER_KV, \
+                    ("When both MLA and graph mode are enabled, the number of queries"
+                     " per kv must be one of {32, 64, 128}. DeepSeek-V2-Lite is therefore"
+                     " not supported, as it only has 16 attention heads. If you're using"
+                     " DeepSeek-V3 or DeepSeek-R1, please make sure num_heads /"
+                     " num_kv_heads falls in {32, 64, 128} after the tensor parallel split.")
 
     def exec_kv(
         self,
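Why the allowed set matters: num_queries_per_kv is the ratio of query heads to kv heads after the tensor parallel split, so both the model's head count and the TP size determine whether the assertion passes. A minimal sketch of the constraint, assuming heads are split evenly across tensor-parallel ranks (check_mla_graph_support is a hypothetical helper, not vllm-ascend API; the 16-head figure for DeepSeek-V2-Lite comes from the assertion message):

    ALLOWED_NUM_QUERIES_PER_KV = [32, 64, 128]

    def check_mla_graph_support(num_heads: int, num_kv_heads: int,
                                tp_size: int) -> int:
        # Per-rank head counts after the tensor parallel split.
        heads_per_rank = num_heads // tp_size
        kv_heads_per_rank = max(num_kv_heads // tp_size, 1)
        num_queries_per_kv = heads_per_rank // kv_heads_per_rank
        assert num_queries_per_kv in ALLOWED_NUM_QUERIES_PER_KV, (
            f"MLA with graph mode needs num_queries_per_kv in "
            f"{ALLOWED_NUM_QUERIES_PER_KV}, got {num_queries_per_kv}")
        return num_queries_per_kv

    check_mla_graph_support(num_heads=128, num_kv_heads=1, tp_size=2)   # 64: passes
    # check_mla_graph_support(num_heads=16, num_kv_heads=1, tp_size=1)  # 16: fails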

vllm_ascend/worker/multi_step_worker.py (1 addition, 1 deletion)

@@ -119,7 +119,7 @@ def _prepare_last_sampled_token_ids_for_tp_workers(
         # execute_model_req
         assert execute_model_req.last_sampled_token_ids is not None
         model_input.last_sampled_token_ids = (
-            execute_model_req.last_sampled_token_ids.cuda())
+            execute_model_req.last_sampled_token_ids.npu())
         model_input.add_sampler_output(
             SamplerOutput(outputs=[], sampled_token_ids=None),
             model_input.last_sampled_token_ids)
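The .cuda() call would fail on Ascend hardware, where tensors live on the "npu" device instead. A minimal usage sketch, assuming torch_npu is installed (importing it registers the npu device and the Tensor.npu() method, the Ascend counterpart of Tensor.cuda()):

    import torch
    import torch_npu  # noqa: F401  # registers the "npu" device with PyTorch

    # Hypothetical stand-in for execute_model_req.last_sampled_token_ids.
    last_sampled_token_ids = torch.tensor([[101], [2048]])

    on_npu = last_sampled_token_ids.npu()  # move host tensor to the Ascend NPU
    back_on_host = on_npu.cpu()            # and back, symmetric with .cuda()/.cpu()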
