Skip to content

Commit 83f2d51

Browse files
author
lt
committed
Merge branch 'br_main_into_eplb' of https://github.com/raindaywhu/vllm-ascend into br_main_into_eplb
2 parents f6830d4 + 1a8d238 commit 83f2d51

File tree

6 files changed

+77
-74
lines changed

6 files changed

+77
-74
lines changed

vllm_ascend/eplb/adaptor/vllm_adaptor.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -152,11 +152,13 @@ def local2global(self,
152152
return placement_global
153153

154154
def get_init_expert_map_from_file(self, num_moe_layers, expert_map_path):
155-
if os.path.exists(expert_map_path):
155+
156+
try:
156157
expert_map_tensor, layers_num, ranks_num = self._expert_file_to_tensor(expert_map_path)
157158
expert_map_all = self.local2global(expert_map_tensor)
158-
else:
159+
except (TypeError, FileNotFoundError, OSError):
159160
expert_map_all = self.determine_expert_map_all()
161+
160162
for layer_idx in range(num_moe_layers):
161163
self.expert_map_per_layer_cpu[layer_idx+3] = \
162164
expert_map_all[layer_idx][self.rank_id]

vllm_ascend/eplb/core/policy/dynamic_ep_v2.py

Lines changed: 19 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -307,61 +307,6 @@ def calculate_initial_imbalance(global_deployment, new_layer_workloads):
307307

308308
return layer_imbalance
309309

310-
def rebalance_experts(self, current_expert_table, expert_workload):
311-
312-
info = DynamicTable()
313-
info.workload_table = np.array(expert_workload)
314-
info.placement_table = np.array(current_expert_table)
315-
layer_num, num_npus, experts_per_npu = info.workload_table.shape
316-
expert_ids, counts = np.unique(info.placement_table[0], return_counts=True)
317-
num_redundancy_expert = self.get_redundant_num(num_npus, counts)
318-
num_original_expert = len(expert_ids)
319-
layer_workloads = self.add_redundant(info.placement_table, info.workload_table, num_original_expert)
320-
max_heat_per_layer_before = self.calculate_max_heat_per_layer(info.workload_table, layer_num)
321-
npu_heat_all_origin = sum(max_heat_per_layer_before)
322-
323-
# 计算负载均衡,部署冗余专家
324-
layer_num = layer_workloads.shape[0]
325-
expert_num = layer_workloads.shape[1]
326-
# 校验专家数量、卡数量、冗余专家数量不能超过卡数量
327-
if num_original_expert != expert_num:
328-
raise ValueError(f"原始专家数量 {num_original_expert} 必须等于 expert_num {expert_num}")
329-
330-
if num_npus <= 0:
331-
raise ValueError("NPUs 数量必须大于 0")
332-
333-
if num_npus < num_redundancy_expert:
334-
raise ValueError(f"NPUs 数量 {num_npus} 必须大于或等于冗余专家数量 {num_redundancy_expert}")
335-
336-
# 每个卡部署的专家数量 一个冗余专家
337-
global_deployment = [[[] for _ in range(num_npus)] for _ in range(layer_num)]
338-
# 遍历获得每一层的放置策略,考虑计算均衡
339-
max_heat_per_layer_after = np.zeros([layer_num])
340-
for layer in range(layer_num):
341-
# 获取当前层专家ID和对应负载,负载需要进行正则化处理, 每个卡加一个冗余专家
342-
weights = np.zeros((expert_num,), dtype='object')
343-
for expert_id, workload_weight in enumerate(layer_workloads[layer]):
344-
weights[expert_id] = (expert_id, workload_weight)
345-
346-
# 获取每一层全局计算均衡的放置策略
347-
result, layer_deployment = self.original_compute_balanced_pack_redundancy(
348-
weights, num_npus, num_redundancy_expert
349-
)
350-
global_deployment[layer] = layer_deployment
351-
max_heat_per_layer_after[layer] = max(result, key=lambda x: x['total_weight'])['total_weight']
352-
353-
# 获取层优先级
354-
layer_changed_ratio = []
355-
for layer_idx in range(layer_num):
356-
layer_changed_ratio.append(max_heat_per_layer_after[layer_idx] / max_heat_per_layer_before[layer_idx])
357-
358-
per_layer_priority = np.argsort(layer_changed_ratio)
359-
npu_heat_all_after = sum(max_heat_per_layer_after)
360-
361-
change = 0
362-
363-
return change, per_layer_priority, np.array(global_deployment).tolist()
364-
365310
@staticmethod
366311
def compute_redundant_assignments(base_experts, num_redundant_experts, num_experts):
367312
"""
@@ -845,6 +790,25 @@ def rebalance_experts(self, current_expert_table, expert_workload):
845790
num_node, num_npus, False, ave_workload,
846791
0.05, num_redundancy_expert)
847792

793+
# To guarantee there is no expert movement inside an NPU
794+
start_physical_idx = 1 if num_redundancy_expert else 0
795+
for rank in range(num_npus):
796+
physical_expert = start_physical_idx
797+
while physical_expert in range(start_physical_idx, experts_per_npu):
798+
# skip the expert which is moved into this rank
799+
if global_deployment[layer][rank][physical_expert] not in current_expert_table[layer, rank, :]:
800+
physical_expert += 1
801+
continue
802+
803+
if global_deployment[layer][rank][physical_expert] != current_expert_table[layer][rank][physical_expert]:
804+
right_idx = np.where(current_expert_table[layer][rank] == global_deployment[layer][rank][physical_expert])[0][0]
805+
# exchange expert with the expert on the right physical index
806+
tempt = global_deployment[layer][rank][right_idx]
807+
global_deployment[layer][rank][right_idx] = global_deployment[layer][rank][physical_expert]
808+
global_deployment[layer][rank][physical_expert] = tempt
809+
else:
810+
physical_expert += 1
811+
848812
for device_id in range(num_npus):
849813
com_between_devices[device_id] = {int(key): int(value) for key, value in
850814
com_between_devices[device_id].items()}

vllm_ascend/eplb/core/policy/policy_factory.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,14 +10,18 @@ class PolicyFactory:
1010
@staticmethod
1111
def generate_policy(policy_type: int, config: DynamicConfig) -> EplbPolicy:
1212
policy = {
13+
# Constraint applying Dynamic EPLB policy V2:
14+
# If there exists redundant expert:
15+
# only one redundant expert can be placed in one NPU and its physical expert index must be 0
16+
1317
# Applying bipartite d2d expert weight update composing
1418
0:MockLoadBalance, # MockLoadBalance
1519
1:DynamicEplb, # Dynamic EPLB policy
1620
2:DynamicEplbV2, # Dynamic EPLB policy V2
1721

1822
# Applying greedy d2d expert weight update composing
19-
4:MockLoadBalance, # MockLoadBalance
20-
5:DynamicEplb, # Dynamic EPLB policy
21-
6:DynamicEplbV2, # Dynamic EPLB policy
23+
3:MockLoadBalance, # MockLoadBalance
24+
4:DynamicEplb, # Dynamic EPLB policy
25+
5:DynamicEplbV2, # Dynamic EPLB policy V2
2226
}
2327
return policy.get(policy_type, MockLoadBalance)(config)

vllm_ascend/eplb/eplb_updator.py

Lines changed: 41 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
#
1717
import torch
1818
import torch.distributed as dist
19+
import vllm.envs as envs
1920
from multiprocessing import Queue, Manager
2021

2122
from vllm.logger import logger
@@ -33,11 +34,17 @@ def set_adaptor(self, adaptor):
3334
self.num_moe_layers = self.adaptor.num_moe_layers
3435

3536
def init_eplb(self, expert_map_path):
36-
37+
self.num_expert_load_gather = 10
3738
self.redundant_enable = (expert_map_path != None)
3839
self.num_iterations: torch.int64 = 130
3940
self.expert_map_path = expert_map_path
4041

42+
try:
43+
if not envs.VLLM_ALLOW_EXPERT_LOAD_COLLECTING:
44+
self.num_expert_load_gather = self.num_iterations
45+
except Exception as e:
46+
self.num_expert_load_gather = self.num_iterations
47+
4148
self.weight_update_counter = 0
4249
self.expert_map_initialized = False
4350
self.update_in_flight = False
@@ -80,10 +87,9 @@ def init_eplb(self, expert_map_path):
8087

8188
def get_update_iteration(self):
8289
self.cur_iterations = self.cur_iterations + 1
83-
if not self.gate_eplb:
84-
return self.cur_iterations % self.num_iterations == 0
85-
else:
86-
return self.cur_iterations == self.num_iterations
90+
load_gather_iteration = self.cur_iterations % self.num_expert_load_gather == 0 if not self.gate_eplb else self.cur_iterations == self.num_iterations
91+
update_iteration = self.cur_iterations % self.num_iterations == 0 if not self.gate_eplb else self.cur_iterations == self.num_iterations
92+
return load_gather_iteration, update_iteration
8793

8894
def get_init_expert_map(self):
8995
try:
@@ -125,12 +131,15 @@ def forward_before(self):
125131
self.eplb_loader.asyn_expert_weight_transfer(self.reqs)
126132

127133
def forward_end(self):
128-
if not self.update_in_flight and self.get_update_iteration():
129-
moe_load = self.compute_and_set_moe_load()
130-
self.wakeup_eplb_worker()
131-
self.update_in_flight = True
132-
self.wait_worker_iterations = 0
133-
self.weight_loading = False
134+
if not self.update_in_flight:
135+
load_gather_iteration, update_iteration = self.get_update_iteration()
136+
if load_gather_iteration:
137+
self.moe_load = self.compute_and_set_moe_load()
138+
if update_iteration:
139+
self.wakeup_eplb_worker()
140+
self.update_in_flight = True
141+
self.wait_worker_iterations = 0
142+
self.weight_loading = False
134143

135144
if self.update_in_flight:
136145
self.wait_worker_iterations = self.wait_worker_iterations + 1
@@ -220,9 +229,27 @@ def unpack_update_batch(self, packed_update_info):
220229
return recovered
221230

222231
def get_expert_load(self) -> str:
223-
"""todo 确认moe_load的值是什么类型"""
224-
# return '{"a":"b"}' # mock
225-
return self.shared_dict['moe_load']
232+
233+
# todo wjh 给到返回值
234+
# return self.shared_dict['moe_load']
235+
# mock json_str
236+
experts_load = ('{\"expert_load\":['
237+
'{\"ip\":\"141.xxx.xxx.181\",'
238+
'\"node_0\":'
239+
'{\"card_0\":'
240+
'[{\"layer_4\":{\"expert_0\":3,\"expert_2\":1}},{\"layer_5\":{\"expert_0\":3,\"expert_2\":1}}],'
241+
'\"card_1\":[{\"layer_4\":{\"expert_1\":3,\"expert_3\":1},\"layer_5\":{\"expert_0\":3,\"'
242+
'expert_2\":1}}]}},{\"ip\":\"141.xxx.xxx.177\",\"node_0\":{\"card_0\":[{\"layer_4\":'
243+
'{\"expert_0\":3,\"expert_2\":1}},{\"layer_5\":{\"expert_0\":3,\"expert_2\":1}}],'
244+
'\"card_1\":[{\"layer_4\":{\"expert_1\":3,\"expert_3\":1}}]}}]}')
245+
return experts_load
246+
247+
def update_expert_load_statistical_period(self, num_expert_load_gather: int, num_iterations: int):
248+
logger.info(f" start update {self.num_expert_load_gather=}, {self.num_iterations=}...")
249+
self.num_expert_load_gather = num_expert_load_gather
250+
self.num_iterations = num_iterations
251+
logger.info(f" update {self.num_expert_load_gather=}, {self.num_iterations=} success...")
252+
226253

227254
def shutdown(self):
228255
"""

vllm_ascend/worker/model_runner_v1.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1589,6 +1589,9 @@ def profile_run(self) -> None:
15891589
def do_get_expert_load(self) -> str:
15901590
return self.eplb_updator.get_expert_load()
15911591

1592+
def do_update_expert_load_statistical_period(self, num_expert_load_gather: int, num_iterations: int):
1593+
return self.eplb_updator.update_expert_load_statistical_period(num_expert_load_gather, num_iterations)
1594+
15921595
def eplb_warmup(self):
15931596
#EPLB
15941597
if self.dynamic_eplb and not self.is_eplb_warmuped:

vllm_ascend/worker/worker_v1.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,9 @@ def get_expert_load(self) -> str:
214214
moe_load = self.model_runner.do_get_expert_load()
215215
return moe_load
216216

217+
def update_expert_load_statistical_period(self, num_expert_load_gather: int, num_iterations: int):
218+
self.model_runner.do_update_expert_load_statistical_period(num_expert_load_gather, num_iterations)
219+
217220
def get_model(self) -> nn.Module:
218221
return self.model_runner.get_model()
219222

0 commit comments

Comments
 (0)