
Commit e8bbe72

[Bug fix] fix attention rank init (#2743)
* fix attention rank init
* fix attention rank init
1 parent 57b086d commit e8bbe72

File tree: 4 files changed, +17 additions, -13 deletions


fastdeploy/model_executor/layers/attention/append_attn_backend.py

Lines changed: 5 additions & 5 deletions
@@ -91,7 +91,7 @@ def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int,
         self.use_speculate: bool = self.speculative_method is not None
         self.speculate_max_draft_token_num: int = fd_config.speculative_config.num_speculative_tokens
         self.keep_pd_step_flag: bool = fd_config.speculative_config.model_type == "mtp"
-        self.rank: int = fd_config.parallel_config.tensor_parallel_rank
+        self.rank: int = fd_config.parallel_config.expert_parallel_rank * fd_config.parallel_config.tensor_parallel_degree + fd_config.parallel_config.tensor_parallel_rank
 
         self.kv_num_heads: int = kv_num_heads
         self.num_heads: int = num_heads
@@ -108,12 +108,12 @@ def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int,
 
         if fd_config.parallel_config.expert_parallel_rank is None:
             fd_config.parallel_config.expert_parallel_rank = 0
-        device_id = self.rank + fd_config.parallel_config.tensor_parallel_degree * \
-            fd_config.parallel_config.expert_parallel_rank
         if self.device_id is None:
-            self.device_id = device_id
+            self.device_id = self.rank
         else:
-            self.device_id = self.device_id.split(",")[device_id]
+            device_ids = self.device_id.split(",")
+            rank_index = self.rank % len(device_ids)
+            self.device_id = self.device_id[rank_index]
 
     def init_attention_metadata(self, forward_meta: ForwardMeta):
         """Initialize attntion metadata hence all layers in the forward pass can reuse it."""

fastdeploy/model_executor/layers/attention/flash_attn_backend.py

Lines changed: 6 additions & 5 deletions
@@ -100,7 +100,7 @@ def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int,
         self.use_speculate = self.speculative_method is not None
         self.speculate_max_draft_token_num = fd_config.speculative_config.num_speculative_tokens
         self.keep_pd_step_flag: bool = fd_config.speculative_config.model_type == "mtp"
-        self.rank: int = fd_config.parallel_config.tensor_parallel_rank
+        self.rank: int = fd_config.parallel_config.expert_parallel_rank * fd_config.parallel_config.tensor_parallel_degree + fd_config.parallel_config.tensor_parallel_rank
 
         # pd_disaggregation
         self.use_pd_disaggregation: int = int(
@@ -110,12 +110,13 @@ def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int,
 
         if fd_config.parallel_config.expert_parallel_rank is None:
             fd_config.parallel_config.expert_parallel_rank = 0
-        device_id = self.rank + fd_config.parallel_config.tensor_parallel_degree * \
-            fd_config.parallel_config.expert_parallel_rank
+
         if self.device_id is None:
-            self.device_id = device_id
+            self.device_id = self.rank
         else:
-            self.device_id = self.device_id.split(",")[device_id]
+            device_ids = self.device_id.split(",")
+            rank_index = self.rank % len(device_ids)
+            self.device_id = self.device_id[rank_index]
 
     def get_attntion_meta(self):
         """get_attntion_meta"""

fastdeploy/model_executor/layers/attention/mla_attention_backend.py

Lines changed: 5 additions & 2 deletions
@@ -109,7 +109,7 @@ def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int,
         self.use_speculate: bool = self.speculative_method is not None
         self.speculate_max_draft_token_num: int = fd_config.speculative_config.num_speculative_tokens
         self.keep_pd_step_flag: bool = fd_config.speculative_config.model_type == "mtp"
-        self.rank: int = fd_config.parallel_config.tensor_parallel_rank
+        self.rank: int = fd_config.parallel_config.expert_parallel_rank * fd_config.parallel_config.tensor_parallel_degree + fd_config.parallel_config.tensor_parallel_rank
 
         self.kv_num_heads: int = kv_num_heads
         self.num_heads: int = num_heads
@@ -135,10 +135,13 @@ def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int,
             os.getenv("FLAGS_use_pd_disaggregation", 0))
         self.start_layer_index: int = fd_config.model_config.start_layer_index
         self.device_id: int = os.getenv("CUDA_VISIBLE_DEVICES", None)
+
         if self.device_id is None:
             self.device_id = self.rank
         else:
-            self.device_id = self.device_id.split(",")[self.rank]
+            device_ids = self.device_id.split(",")
+            rank_index = self.rank % len(device_ids)
+            self.device_id = self.device_id[rank_index]
 
     def init_attention_metadata(self, forward_meta: ForwardMeta):
         """Initialize attention metadata hence all layers in the forward pass can reuse it."""

fastdeploy/model_executor/layers/attention/xpu_attn_backend.py

Lines changed: 1 addition & 1 deletion
@@ -91,7 +91,7 @@ def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int,
         # self.use_speculate = self.speculate_method is not None
         # self.speculate_max_draft_token_num = fd_config.parallel_config.speculate_max_draft_tokens
         self.keep_pd_step_flag: bool = fd_config.speculative_config.model_type == "mtp"
-        self.rank: int = fd_config.parallel_config.tensor_parallel_rank
+        self.rank: int = fd_config.parallel_config.expert_parallel_rank * fd_config.parallel_config.tensor_parallel_degree + fd_config.parallel_config.tensor_parallel_rank
 
         self.kv_num_heads: int = kv_num_heads
         self.num_heads: int = num_heads
