6 files changed, +17 −8 lines

@@ -129,6 +129,9 @@ class AscendMetadata:
     attn_state: AscendAttentionState = AscendAttentionState.ChunkedPrefill
     attn_mask: Optional[torch.Tensor] = None
 
+    # For logging.
+    num_input_tokens: int = 0  # Number of tokens including padding.
+
 
 
 class AscendAttentionMetadataBuilder:
@@ -21,12 +21,18 @@ def get_etp_group() -> GroupCoordinator:
     return _ETP
 
 
+def model_parallel_initialized():
+    return (_ETP is not None and _EP is not None)
+
+
 def init_ascend_model_parallel(
     tensor_model_parallel_size: int = 1,
     pipeline_model_parallel_size: int = 1,
     expert_tensor_parallel_size: int = 1,
     backend: Optional[str] = None,
 ):
+    if model_parallel_initialized():
+        return
     assert torch.distributed.is_initialized()
     world_size: int = torch.distributed.get_world_size()
     backend = backend or torch.distributed.get_backend(
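Read outside the diff, this change makes initialization idempotent: a second call returns early instead of re-creating the groups. A minimal standalone sketch of that guard pattern, with placeholder objects standing in for the real GroupCoordinator instances:

from typing import Optional

# Stand-ins for the module-level group coordinators; in vllm-ascend these are
# GroupCoordinator instances, not plain objects.
_EP: Optional[object] = None
_ETP: Optional[object] = None


def model_parallel_initialized() -> bool:
    # Initialization only counts as done once both groups exist.
    return _EP is not None and _ETP is not None


def init_ascend_model_parallel() -> None:
    global _EP, _ETP
    if model_parallel_initialized():
        return  # repeated calls become a no-op instead of re-creating groups
    _EP = object()   # placeholder for expert-parallel group setup
    _ETP = object()  # placeholder for expert-tensor-parallel group setup


init_ascend_model_parallel()
init_ascend_model_parallel()  # second call returns immediately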
@@ -66,8 +66,7 @@ def fused_experts_with_mc2(
     local_rank = torch.distributed.get_rank(group=ep_group)
     all_to_all_group_size = torch.distributed.get_world_size(ep_group)
 
-    world_szie = torch.distributed.get_world_size()
-    tp_size = world_szie // all_to_all_group_size
+    tp_size = get_etp_group().world_size
     tp_rank = rank % tp_size
 
     stage1_kwargs = {
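Besides dropping the misspelled world_szie, the replacement reads the size directly from the expert-tensor-parallel group instead of inferring it from the global world size, which only works when EP and ETP exactly tile the job. A toy illustration with hypothetical sizes (not taken from the PR), purely to show the two expressions can disagree once another parallel dimension such as data parallelism is present:

# Hypothetical layout, illustrative only: 16 ranks = 2 (data parallel)
# x 8 (expert parallel) x 1 (expert tensor parallel).
world_size = 16
all_to_all_group_size = 8   # size of one expert-parallel group
etp_group_size = 1          # size of one expert-tensor-parallel group

tp_size_inferred = world_size // all_to_all_group_size  # 2 -- picks up the DP factor too
tp_size_from_group = etp_group_size                     # 1 -- what the kernel actually needs

print(tp_size_inferred, tp_size_from_group)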
@@ -70,13 +70,14 @@
 # on multi-node dp inference implementation
 # 4. `ParallelConfig.stateless_init_dp_group`
 # Why:
-#   vLLM use gloo backend by default to initialize stateless dp process gourp, but we want to use hccl here to
-#   get better performance
+#   vLLM use gloo backend by default to initialize stateless dp process group, but we want to use hccl here to
+#   get better performance. Initialize the global variable of dp_group to prefill dummy_run.
 # How:
-#   adopt nccl backend to init process group
+#   adopt nccl backend to init process group and add the global variable of dp_group.
 # Related PR (if no, explain why): no related PR, we want add this ability into vllm
 # Future Plan:
 #   Remove those patch when vllm merged them
+#   Add the global variable of dp_group in platform when vllm merged them.
 #
 #
 # * Worker Patch:
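For the backend part of this note, a hedged sketch of what selecting a non-default communication backend looks like with plain torch.distributed. This is not vLLM's stateless DP-group mechanism, and the "hccl" backend is assumed to be registered only when torch_npu is installed on Ascend hardware:

import os
import torch.distributed as dist


def init_dp_group(rank: int, world_size: int,
                  master_addr: str = "127.0.0.1", master_port: int = 29500,
                  backend: str = "gloo") -> None:
    # "gloo" runs on CPU anywhere; "nccl" needs CUDA GPUs; "hccl" is assumed to be
    # provided by torch_npu on Ascend devices.
    os.environ.setdefault("MASTER_ADDR", master_addr)
    os.environ.setdefault("MASTER_PORT", str(master_port))
    dist.init_process_group(backend=backend, rank=rank, world_size=world_size)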
@@ -20,6 +20,7 @@
 import torch
 import vllm
 import vllm.distributed
+import vllm.envs as envs
 from torch.distributed import ProcessGroup
 from torch.distributed.distributed_c10d import (Backend, PrefixStore,
                                                  _get_default_timeout,
@@ -164,10 +165,9 @@ def parallel_config_get_dp_port(self) -> int:
     """
     answer = self.data_parallel_master_port
     self.data_parallel_master_port += 1
-    import os
 
     # NOTE: Get port from envs directly when using torchrun
-    port = int(os.environ.get("MASTER_PORT", answer))  # type: ignore
+    port = envs.VLLM_DP_MASTER_PORT if envs.VLLM_DP_MASTER_PORT else answer
     return port
 
 
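The replacement expresses "use the explicitly configured DP master port if set, otherwise the rolling per-call value". A small equivalent written against the raw environment variable, assuming envs.VLLM_DP_MASTER_PORT mirrors the variable of the same name (the helper below is hypothetical):

import os


def resolve_dp_port(answer: int) -> int:
    """Prefer an explicitly configured DP master port; fall back to the counter value."""
    configured = os.environ.get("VLLM_DP_MASTER_PORT")
    return int(configured) if configured else answer


# resolve_dp_port(29501) -> 29501 unless VLLM_DP_MASTER_PORT is set to a non-empty value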
@@ -173,7 +173,7 @@ def execute_model(
         scheduler_output: "SchedulerOutput",
     ) -> Optional[ModelRunnerOutput]:
         output = self.model_runner.execute_model(scheduler_output)
-        return output if self.rank == 0 else None
+        return output if self.is_driver_worker else None
 
     def load_model(self) -> None:
         self.model_runner.load_model()
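A hedged sketch of the driver-worker convention the new condition relies on: only the worker flagged as driver returns the model output, and that worker is not necessarily global rank 0. The class below is illustrative, not vLLM's Worker:

from typing import Any, Optional


class ToyWorker:
    def __init__(self, rank: int, is_driver_worker: bool):
        self.rank = rank
        self.is_driver_worker = is_driver_worker

    def execute_model(self, scheduler_output: Any) -> Optional[dict]:
        output = {"sampled_token_ids": []}  # stand-in for ModelRunnerOutput
        # Keying on the driver flag rather than `rank == 0` stays correct when the
        # driver lives on a rank other than 0 (e.g. one driver per node in multi-node DP).
        return output if self.is_driver_worker else None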