
Commit 8d1e59c

[BugFix] Fix the problem that torchair doesn't support tp > 4. (#1404)
This PR removes the restriction that TP cannot be greater than 4 in the torchair scenario, because the latest CANN release has fixed the underlying bug. For now this PR is for testing only; it will be merged once the next CANN version is released.

Signed-off-by: whx-sjtu <2952154980@qq.com>
1 parent 105d2df commit 8d1e59c
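The removed allowlist is what effectively capped tensor parallelism at 4 with the torchair graph: the guard required num_heads / num_kv_heads per rank to be one of {32, 64, 128}. The sketch below works that arithmetic through. It assumes, as the removed assertion message implies but the commit does not state outright, that MLA leaves a single KV head per rank, so the per-rank value is simply the model's attention-head count divided by the TP degree; the 16-head figure for DeepSeek-V2-Lite comes from that message, while 128 heads for DeepSeek-V3/R1 is those models' published attention-head count.

# Sketch of the constraint this commit lifts, reconstructed from the removed
# assertion. Assumes MLA leaves a single (latent) KV head per rank, so the
# per-rank queries-per-kv count is total_heads // tp_size; that reading is
# inferred from the removed message, not stated elsewhere in this commit.
_ALLOWED_NUM_QUERIES_PER_KV = [32, 64, 128]  # the removed allowlist

def old_guard_allows(total_attn_heads: int, tp_size: int) -> bool:
    """Return True if the removed torchair/MLA guard would have accepted this split."""
    num_queries_per_kv = total_attn_heads // tp_size
    return num_queries_per_kv in _ALLOWED_NUM_QUERIES_PER_KV

if __name__ == "__main__":
    # DeepSeek-V3 / DeepSeek-R1 have 128 attention heads:
    #   TP=1 -> 128, TP=2 -> 64, TP=4 -> 32 (allowed), TP=8 -> 16 (rejected),
    # which is why torchair graph mode was effectively capped at TP=4.
    for tp in (1, 2, 4, 8):
        print(f"TP={tp:<2} queries/kv={128 // tp:<3} allowed={old_guard_allows(128, tp)}")
    # DeepSeek-V2-Lite has only 16 attention heads, so even TP=1 was rejected.
    print("DeepSeek-V2-Lite, TP=1 allowed =", old_guard_allows(16, 1))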

2 files changed: 0 additions, 21 deletions


vllm_ascend/attention/attention.py

Lines changed: 0 additions & 11 deletions

@@ -40,8 +40,6 @@
 from vllm_ascend.worker.model_runner import (
     ModelInputForNPUBuilder, ModelInputForNPUWithSamplingMetadata)
 
-_ALLOWED_NUM_QUERIES_PER_KV = [32, 64, 128]
-
 
 def generate_attn_mask(max_seq_len: int, dtype=torch.float16, mask_value=None):
     # Construct lower triangle matrix.

@@ -1007,15 +1005,6 @@ def __init__(
         ascend_config = get_ascend_config()
         self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled
 
-        # TODO: support numHeads / numKvHeads < 16 in MLA kernel
-        if self.torchair_graph_enabled:
-            assert self.num_queries_per_kv in _ALLOWED_NUM_QUERIES_PER_KV, \
-                ("The allowed number of queries per kv when enabling both MLA and Graph mode"
-                 " only support {32, 64, 128}, Thus this is not supported for DeepSeek-V2-Lite,"
-                 " as it only has 16 attention heads. And if you're using DeepSeek-V3 or DeepSeek-R1,"
-                 " please make sure after the tensor parallel split, num_heads / num_kv_heads in "
-                 "{32, 64, 128}.")
-
     def exec_kv(
         self,
         hidden_states: torch.Tensor,

vllm_ascend/attention/mla_v1.py

Lines changed: 0 additions & 10 deletions

@@ -15,7 +15,6 @@
 
 from vllm_ascend import envs
 from vllm_ascend.ascend_config import get_ascend_config
-from vllm_ascend.attention.attention import _ALLOWED_NUM_QUERIES_PER_KV
 from vllm_ascend.attention.attention_v1 import AscendAttentionState
 from vllm_ascend.multistream.base import MSAttentionMetadataSplitConfig
 from vllm_ascend.multistream.context import get_multistream_comm_context

@@ -568,15 +567,6 @@ def __init__(
             self.spec_token_num = speculative_config.num_speculative_tokens
             assert self.spec_token_num > 0
 
-        # TODO: support numHeads / numKvHeads < 16 in MLA kernel
-        if self.torchair_graph_enabled:
-            assert self.num_queries_per_kv in _ALLOWED_NUM_QUERIES_PER_KV, \
-                ("The allowed number of queries per kv when enabling both MLA and Graph mode"
-                 " only support {32, 64, 128}, Thus this is not supported for DeepSeek-V2-Lite,"
-                 " as it only has 16 attention heads. And if you're using DeepSeek-V3 or DeepSeek-R1,"
-                 " please make sure after the tensor parallel split, num_heads / num_kv_heads in "
-                 "{32, 64, 128}.")
-
     def _v_up_proj_and_o_proj(self, x):
         # Convert from (B, N, L) to (N, B, L)
         x = x.view(-1, self.num_heads, self.kv_lora_rank).transpose(0, 1)
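With the guard deleted from both attention backends, a TP > 4 deployment of a 128-head model can enable the torchair graph again. A minimal launch sketch follows, assuming the vllm-ascend convention of passing torchair_graph_config through vLLM's additional_config (the diff's ascend_config.torchair_graph_config.enabled suggests this key layout); the model ID, TP degree, and config keys here are illustrative and should be checked against the vllm-ascend release you actually use.

# Hedged usage sketch: run a DeepSeek-R1-sized model with TP=8 and the torchair
# graph enabled, a combination the removed assertion used to reject
# (128 heads / 8 ranks = 16 queries per kv, not in {32, 64, 128}).
# The additional_config keys are assumed, mirroring the diff's
# ascend_config.torchair_graph_config.enabled.
from vllm import LLM, SamplingParams

llm = LLM(
    model="deepseek-ai/DeepSeek-R1",  # illustrative model ID
    tensor_parallel_size=8,           # > 4, newly allowed by this change
    additional_config={"torchair_graph_config": {"enabled": True}},
)
print(llm.generate(["Hello"], SamplingParams(max_tokens=16)))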
