6 files changed, +17 −8 lines

@@ -129,6 +129,9 @@ class AscendMetadata:
     attn_state: AscendAttentionState = AscendAttentionState.ChunkedPrefill
     attn_mask: Optional[torch.Tensor] = None
 
+    # For logging.
+    num_input_tokens: int = 0  # Number of tokens including padding.
+
 
 
 class AscendAttentionMetadataBuilder:
@@ -21,12 +21,18 @@ def get_etp_group() -> GroupCoordinator:
     return _ETP
 
 
+def model_parallel_initialized():
+    return (_ETP is not None and _EP is not None)
+
+
 def init_ascend_model_parallel(
     tensor_model_parallel_size: int = 1,
     pipeline_model_parallel_size: int = 1,
     expert_tensor_parallel_size: int = 1,
     backend: Optional[str] = None,
 ):
+    if model_parallel_initialized():
+        return
     assert torch.distributed.is_initialized()
     world_size: int = torch.distributed.get_world_size()
     backend = backend or torch.distributed.get_backend(
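Read outside the diff, this change makes initialization idempotent: a second call returns early instead of re-creating the groups. A minimal standalone sketch of that guard pattern, with placeholder objects standing in for the real GroupCoordinator instances:

from typing import Optional

# Stand-ins for the module-level group coordinators; in vllm-ascend these are
# GroupCoordinator instances, not plain objects.
_EP: Optional[object] = None
_ETP: Optional[object] = None


def model_parallel_initialized() -> bool:
    # Initialization only counts as done once both groups exist.
    return _EP is not None and _ETP is not None


def init_ascend_model_parallel() -> None:
    global _EP, _ETP
    if model_parallel_initialized():
        return  # repeated calls become a no-op instead of re-creating groups
    _EP = object()   # placeholder for expert-parallel group setup
    _ETP = object()  # placeholder for expert-tensor-parallel group setup


init_ascend_model_parallel()
init_ascend_model_parallel()  # second call returns immediately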
@@ -66,8 +66,7 @@ def fused_experts_with_mc2(
     local_rank = torch.distributed.get_rank(group=ep_group)
     all_to_all_group_size = torch.distributed.get_world_size(ep_group)
 
-    world_szie = torch.distributed.get_world_size()
-    tp_size = world_szie // all_to_all_group_size
+    tp_size = get_etp_group().world_size
     tp_rank = rank % tp_size
 
     stage1_kwargs = {
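Besides dropping the misspelled world_szie, the replacement reads the size directly from the expert-tensor-parallel group instead of inferring it from the global world size, which only works when EP and ETP exactly tile the job. A toy illustration with hypothetical sizes (not taken from the PR), purely to show the two expressions can disagree once another parallel dimension such as data parallelism is present:

# Hypothetical layout, illustrative only: 16 ranks = 2 (data parallel)
# x 8 (expert parallel) x 1 (expert tensor parallel).
world_size = 16
all_to_all_group_size = 8   # size of one expert-parallel group
etp_group_size = 1          # size of one expert-tensor-parallel group

tp_size_inferred = world_size // all_to_all_group_size  # 2 -- picks up the DP factor too
tp_size_from_group = etp_group_size                     # 1 -- what the kernel actually needs

print(tp_size_inferred, tp_size_from_group)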
@@ -70,13 +70,14 @@
 # on multi-node dp inference implementation
 # 4. `ParallelConfig.stateless_init_dp_group`
 # Why:
-#   vLLM use gloo backend by default to initialize stateless dp process gourp, but we want to use hccl here to
-#   get better performance
+#   vLLM use gloo backend by default to initialize stateless dp process group, but we want to use hccl here to
+#   get better performance. Initialize the global variable of dp_group to prefill dummy_run.
 # How:
-#   adopt nccl backend to init process group
+#   adopt nccl backend to init process group and add the global variable of dp_group.
 # Related PR (if no, explain why): no related PR, we want add this ability into vllm
 # Future Plan:
 #   Remove those patch when vllm merged them
+#   Add the global variable of dp_group in platform when vllm merged them.
 #
 #
 # * Worker Patch:
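For the backend part of this note, a hedged sketch of what selecting a non-default communication backend looks like with plain torch.distributed. This is not vLLM's stateless DP-group mechanism, and the "hccl" backend is assumed to be registered only when torch_npu is installed on Ascend hardware:

import os
import torch.distributed as dist


def init_dp_group(rank: int, world_size: int,
                  master_addr: str = "127.0.0.1", master_port: int = 29500,
                  backend: str = "gloo") -> None:
    # "gloo" runs on CPU anywhere; "nccl" needs CUDA GPUs; "hccl" is assumed to be
    # provided by torch_npu on Ascend devices.
    os.environ.setdefault("MASTER_ADDR", master_addr)
    os.environ.setdefault("MASTER_PORT", str(master_port))
    dist.init_process_group(backend=backend, rank=rank, world_size=world_size)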
@@ -20,6 +20,7 @@
 import torch
 import vllm
 import vllm.distributed
+import vllm.envs as envs
 from torch.distributed import ProcessGroup
 from torch.distributed.distributed_c10d import (Backend, PrefixStore,
                                                  _get_default_timeout,
@@ -164,10 +165,9 @@ def parallel_config_get_dp_port(self) -> int:
     """
     answer = self.data_parallel_master_port
     self.data_parallel_master_port += 1
-    import os
 
     # NOTE: Get port from envs directly when using torchrun
-    port = int(os.environ.get("MASTER_PORT", answer))  # type: ignore
+    port = envs.VLLM_DP_MASTER_PORT if envs.VLLM_DP_MASTER_PORT else answer
     return port
 
 
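The replacement expresses "use the explicitly configured DP master port if set, otherwise the rolling per-call value". A small equivalent written against the raw environment variable, assuming envs.VLLM_DP_MASTER_PORT mirrors the variable of the same name (the helper below is hypothetical):

import os


def resolve_dp_port(answer: int) -> int:
    """Prefer an explicitly configured DP master port; fall back to the counter value."""
    configured = os.environ.get("VLLM_DP_MASTER_PORT")
    return int(configured) if configured else answer


# resolve_dp_port(29501) -> 29501 unless VLLM_DP_MASTER_PORT is set to a non-empty value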
@@ -173,7 +173,7 @@ def execute_model(
         scheduler_output: "SchedulerOutput",
     ) -> Optional[ModelRunnerOutput]:
         output = self.model_runner.execute_model(scheduler_output)
-        return output if self.rank == 0 else None
+        return output if self.is_driver_worker else None
 
     def load_model(self) -> None:
         self.model_runner.load_model()
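A hedged sketch of the driver-worker convention the new condition relies on: only the worker flagged as driver returns the model output, and that worker is not necessarily global rank 0. The class below is illustrative, not vLLM's Worker:

from typing import Any, Optional


class ToyWorker:
    def __init__(self, rank: int, is_driver_worker: bool):
        self.rank = rank
        self.is_driver_worker = is_driver_worker

    def execute_model(self, scheduler_output: Any) -> Optional[dict]:
        output = {"sampled_token_ids": []}  # stand-in for ModelRunnerOutput
        # Keying on the driver flag rather than `rank == 0` stays correct when the
        # driver lives on a rank other than 0 (e.g. one driver per node in multi-node DP).
        return output if self.is_driver_worker else None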