 
 from vllm_ascend import envs
 from vllm_ascend.ascend_config import get_ascend_config
-from vllm_ascend.attention.attention import _ALLOWED_NUM_QUERIES_PER_KV
 from vllm_ascend.attention.attention_v1 import AscendAttentionState
 from vllm_ascend.multistream.base import MSAttentionMetadataSplitConfig
 from vllm_ascend.multistream.context import get_multistream_comm_context
@@ -568,15 +567,6 @@ def __init__(
             self.spec_token_num = speculative_config.num_speculative_tokens
             assert self.spec_token_num > 0
 
-        # TODO: support numHeads / numKvHeads < 16 in MLA kernel
-        if self.torchair_graph_enabled:
-            assert self.num_queries_per_kv in _ALLOWED_NUM_QUERIES_PER_KV, \
-                ("The allowed number of queries per kv when enabling both MLA and Graph mode"
-                 " only support {32, 64, 128}, Thus this is not supported for DeepSeek-V2-Lite,"
-                 " as it only has 16 attention heads. And if you're using DeepSeek-V3 or DeepSeek-R1,"
-                 " please make sure after the tensor parallel split, num_heads / num_kv_heads in "
-                 "{32, 64, 128}.")
-
     def _v_up_proj_and_o_proj(self, x):
         # Convert from (B, N, L) to (N, B, L)
         x = x.view(-1, self.num_heads, self.kv_lora_rank).transpose(0, 1)
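For context, the assert deleted above enforced that, when MLA and torchair graph mode were enabled together, the per-rank ratio of query heads to KV heads had to be 32, 64, or 128. The sketch below is not part of this change; it only illustrates that old check under assumed names (`num_attention_heads`, `num_kv_heads`, `tp_size`) and an assumed single KV head per rank, with head counts taken from the removed error message.

```python
# Minimal sketch of the constraint the removed assert used to enforce.
# Names, signatures, and head counts are illustrative assumptions, not
# the vllm-ascend source.
_ALLOWED_NUM_QUERIES_PER_KV = {32, 64, 128}

def old_mla_graph_check(num_attention_heads: int, num_kv_heads: int,
                        tp_size: int) -> bool:
    """Return True if the removed MLA + graph-mode check would have passed."""
    # num_queries_per_kv is taken per tensor-parallel rank: query heads are
    # split across TP ranks, then divided by the KV head count.
    heads_per_rank = num_attention_heads // tp_size
    num_queries_per_kv = heads_per_rank // num_kv_heads
    return num_queries_per_kv in _ALLOWED_NUM_QUERIES_PER_KV

# DeepSeek-V2-Lite has 16 attention heads, so 16 // 1 = 16 never lands in
# {32, 64, 128}; a 128-head model (e.g. DeepSeek-V3/R1) passes for TP of 1, 2, or 4.
print(old_mla_graph_check(16, 1, 1))   # False
print(old_mla_graph_check(128, 1, 2))  # True
```

Dropping the assert also makes the `_ALLOWED_NUM_QUERIES_PER_KV` import removed in the first hunk unnecessary, since this check was its only use in this file.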