
Commit 6610aa2

Revert "[Bug fix] fix attention rank init (#2743)" (#2761)
This reverts commit e8bbe72.
Parent: f72c4de · Commit: 6610aa2

File tree: 4 files changed (+13, -17 lines)


fastdeploy/model_executor/layers/attention/append_attn_backend.py

Lines changed: 5 additions & 5 deletions
@@ -91,7 +91,7 @@ def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int,
         self.use_speculate: bool = self.speculative_method is not None
         self.speculate_max_draft_token_num: int = fd_config.speculative_config.num_speculative_tokens
         self.keep_pd_step_flag: bool = fd_config.speculative_config.model_type == "mtp"
-        self.rank: int = fd_config.parallel_config.expert_parallel_rank * fd_config.parallel_config.tensor_parallel_degree + fd_config.parallel_config.tensor_parallel_rank
+        self.rank: int = fd_config.parallel_config.tensor_parallel_rank

         self.kv_num_heads: int = kv_num_heads
         self.num_heads: int = num_heads
@@ -108,12 +108,12 @@ def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int,

         if fd_config.parallel_config.expert_parallel_rank is None:
             fd_config.parallel_config.expert_parallel_rank = 0
+        device_id = self.rank + fd_config.parallel_config.tensor_parallel_degree * \
+            fd_config.parallel_config.expert_parallel_rank
         if self.device_id is None:
-            self.device_id = self.rank
+            self.device_id = device_id
         else:
-            device_ids = self.device_id.split(",")
-            rank_index = self.rank % len(device_ids)
-            self.device_id = self.device_id[rank_index]
+            self.device_id = self.device_id.split(",")[device_id]

     def init_attention_metadata(self, forward_meta: ForwardMeta):
         """Initialize attntion metadata hence all layers in the forward pass can reuse it."""

fastdeploy/model_executor/layers/attention/flash_attn_backend.py

Lines changed: 5 additions & 6 deletions
@@ -100,7 +100,7 @@ def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int,
         self.use_speculate = self.speculative_method is not None
         self.speculate_max_draft_token_num = fd_config.speculative_config.num_speculative_tokens
         self.keep_pd_step_flag: bool = fd_config.speculative_config.model_type == "mtp"
-        self.rank: int = fd_config.parallel_config.expert_parallel_rank * fd_config.parallel_config.tensor_parallel_degree + fd_config.parallel_config.tensor_parallel_rank
+        self.rank: int = fd_config.parallel_config.tensor_parallel_rank

         # pd_disaggregation
         self.use_pd_disaggregation: int = int(
@@ -110,13 +110,12 @@ def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int,

         if fd_config.parallel_config.expert_parallel_rank is None:
             fd_config.parallel_config.expert_parallel_rank = 0
-
+        device_id = self.rank + fd_config.parallel_config.tensor_parallel_degree * \
+            fd_config.parallel_config.expert_parallel_rank
         if self.device_id is None:
-            self.device_id = self.rank
+            self.device_id = device_id
         else:
-            device_ids = self.device_id.split(",")
-            rank_index = self.rank % len(device_ids)
-            self.device_id = self.device_id[rank_index]
+            self.device_id = self.device_id.split(",")[device_id]

     def get_attntion_meta(self):
         """get_attntion_meta"""

fastdeploy/model_executor/layers/attention/mla_attention_backend.py

Lines changed: 2 additions & 5 deletions
@@ -109,7 +109,7 @@ def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int,
         self.use_speculate: bool = self.speculative_method is not None
         self.speculate_max_draft_token_num: int = fd_config.speculative_config.num_speculative_tokens
         self.keep_pd_step_flag: bool = fd_config.speculative_config.model_type == "mtp"
-        self.rank: int = fd_config.parallel_config.expert_parallel_rank * fd_config.parallel_config.tensor_parallel_degree + fd_config.parallel_config.tensor_parallel_rank
+        self.rank: int = fd_config.parallel_config.tensor_parallel_rank

         self.kv_num_heads: int = kv_num_heads
         self.num_heads: int = num_heads
@@ -135,13 +135,10 @@ def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int,
             os.getenv("FLAGS_use_pd_disaggregation", 0))
         self.start_layer_index: int = fd_config.model_config.start_layer_index
         self.device_id: int = os.getenv("CUDA_VISIBLE_DEVICES", None)
-
         if self.device_id is None:
             self.device_id = self.rank
         else:
-            device_ids = self.device_id.split(",")
-            rank_index = self.rank % len(device_ids)
-            self.device_id = self.device_id[rank_index]
+            self.device_id = self.device_id.split(",")[self.rank]

     def init_attention_metadata(self, forward_meta: ForwardMeta):
         """Initialize attention metadata hence all layers in the forward pass can reuse it."""

fastdeploy/model_executor/layers/attention/xpu_attn_backend.py

Lines changed: 1 addition & 1 deletion
@@ -91,7 +91,7 @@ def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int,
         # self.use_speculate = self.speculate_method is not None
         # self.speculate_max_draft_token_num = fd_config.parallel_config.speculate_max_draft_tokens
         self.keep_pd_step_flag: bool = fd_config.speculative_config.model_type == "mtp"
-        self.rank: int = fd_config.parallel_config.expert_parallel_rank * fd_config.parallel_config.tensor_parallel_degree + fd_config.parallel_config.tensor_parallel_rank
+        self.rank: int = fd_config.parallel_config.tensor_parallel_rank

         self.kv_num_heads: int = kv_num_heads
         self.num_heads: int = num_heads
