
Commit 2dba24d

Author: lt
Merge branch 'br_main_into_eplb' of https://github.com/raindaywhu/vllm-ascend into br_main_into_eplb
2 parents: 83f2d51 + 5225f3c

File tree: 3 files changed (+31, −19 lines)

vllm_ascend/eplb/adaptor/vllm_adaptor.py

Lines changed: 23 additions & 12 deletions

```diff
@@ -85,25 +85,36 @@ def init_expert_param_per_layer(self):
     def get_rank_expert_workload(
         self,
         num_moe_layers: int,
+        dummy_run = False
     ) -> torch.Tensor:
-        # Collect topk_ids of each layer -> list of [B, K]
+
         all_topk_ids = [self.model.get_topk_ids(i) for i in range(num_moe_layers)]
-        # stack & flatten -> ids2d: [L, B*K]
-        stacked = torch.stack(all_topk_ids, dim=0)  # [L, B, K]
+        stacked = torch.stack(all_topk_ids, dim=0)
         L, B, K = stacked.shape
-        ids2d = stacked.view(L, B * K).to(torch.int64)  # [L, N]
+        N = B * K
+        device = stacked.device
+        G = self.global_expert_num
+
+        if not hasattr(self, "cum_moe_load") or self.cum_moe_load is None:
+            self.cum_moe_load = torch.zeros((L, G),
+                                            dtype=torch.int64,
+                                            device=device)
+
+        if dummy_run:
+            return self.cum_moe_load
+
+        ids1d = stacked.view(-1).to(torch.int64)

-        device = ids2d.device
-        moe_load = torch.zeros((L, self.global_expert_num),
-                               dtype=torch.int64, device=device)
+        row_idx = torch.arange(L, device=device).repeat_interleave(N)

-        ones2d = torch.ones_like(ids2d, dtype=torch.int64)
+        combined = row_idx * G + ids1d

-        assert moe_load.dim() == 2 and ids2d.dim() == 2 and ones2d.dim() == 2
-        assert ids2d.shape == ones2d.shape
+        counts = torch.bincount(combined, minlength=L * G)
+        workload = counts.view(L, G)

-        moe_load.scatter_add_(dim=1, index=ids2d, src=ones2d)
-        return moe_load
+        self.cum_moe_load.add_(workload)
+
+        return self.cum_moe_load

     def get_init_expert_map(self, num_moe_layers):
         expert_map = self.model.get_all_expert_map(num_moe_layers)
```
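
The hunk above replaces the per-layer `scatter_add_` with a single bincount over a cumulative buffer. A minimal runnable sketch of that counting scheme, assuming toy shapes and CPU tensors (this is an illustration, not the adaptor code): each (layer, expert) pair is mapped to its own bin, so one `torch.bincount` over the flattened topk_ids yields the per-layer expert workload.

```python
import torch

# Toy sizes: L layers, B tokens, K experts per token, G global experts (assumed values).
L, B, K, G = 2, 4, 2, 8
stacked = torch.randint(0, G, (L, B, K))        # stand-in for the stacked topk_ids

N = B * K
ids1d = stacked.view(-1).to(torch.int64)                        # [L * N] expert ids
row_idx = torch.arange(L).repeat_interleave(N)                  # layer index for each id
combined = row_idx * G + ids1d                                  # unique bin per (layer, expert)
workload = torch.bincount(combined, minlength=L * G).view(L, G)

# Same totals as the old scatter_add_ formulation: one count row per layer.
assert int(workload.sum()) == L * N
```

In the adaptor this `workload` is then added into `self.cum_moe_load`, and `dummy_run=True` returns that running total without accumulating.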

vllm_ascend/eplb/eplb_updator.py

Lines changed: 6 additions & 6 deletions

```diff
@@ -130,11 +130,12 @@ def forward_before(self):
         self.reqs = []
         self.eplb_loader.asyn_expert_weight_transfer(self.reqs)

-    def forward_end(self):
+    def forward_end(self,dummy_run=False):
+        self.adaptor.get_rank_expert_workload(self.num_moe_layers,dummy_run)
         if not self.update_in_flight:
             load_gather_iteration, update_iteration = self.get_update_iteration()
             if load_gather_iteration:
-                self.moe_load = self.compute_and_set_moe_load()
+                moe_load = self.compute_and_set_moe_load(dummy_run)
             if update_iteration:
                 self.wakeup_eplb_worker()
                 self.update_in_flight = True
@@ -146,9 +147,8 @@ def forward_end(self):

         self.eplb_loader.update_expert_map_and_weight(self.reqs, self.redundant_enable)

-    def compute_and_set_moe_load(self):
-        local_load = self.adaptor.get_rank_expert_workload(self.num_moe_layers)
-
+    def compute_and_set_moe_load(self,dummy_run=False):
+        local_load = self.adaptor.get_rank_expert_workload(self.num_moe_layers,dummy_run)
         self._gather_buffer = None
         if dist.is_initialized():
             self.world_size = dist.get_world_size()
@@ -161,7 +161,7 @@ def compute_and_set_moe_load(self):

             dist.all_gather_into_tensor(self._gather_buffer, local_load)

-            moe_load = self._gather_buffer.permute(1, 0, 2).contiguous()
+            moe_load = self._gather_buffer.permute(1, 0, 2)
             self.shared_dict["moe_load"] = moe_load.cpu()
             logger.debug(f"[ModelRunner] Updated shared_dict['moe_load'] shape={moe_load.shape}")
         else:
```
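
For context, a hedged sketch of the gather step this hunk touches (it assumes an already-initialized `torch.distributed` process group; `gather_moe_load` and its shapes are illustrative, not the updator's API): every rank contributes its `[L, G]` counts, and the stacked buffer is reordered to `[L, world_size, G]`.

```python
import torch
import torch.distributed as dist

def gather_moe_load(local_load: torch.Tensor) -> torch.Tensor:
    """Gather per-rank [L, G] expert loads into a [L, world_size, G] tensor."""
    world_size = dist.get_world_size()
    L, G = local_load.shape
    buf = torch.zeros((world_size, L, G),
                      dtype=local_load.dtype,
                      device=local_load.device)
    # Each rank's [L, G] counts land in its slot of buf -> [world_size, L, G].
    dist.all_gather_into_tensor(buf, local_load)
    # Reorder to [L, world_size, G]; .cpu() copies data regardless of layout,
    # which is what makes dropping the explicit .contiguous() safe here.
    return buf.permute(1, 0, 2)
```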

vllm_ascend/worker/model_runner_v1.py

Lines changed: 2 additions & 1 deletion

```diff
@@ -1544,7 +1544,8 @@ def _dummy_run(
                              inputs_embeds=inputs_embeds)

         if not is_compile and not is_profile_run and self.dynamic_eplb:
-            self.eplb_updator.forward_end()
+            dummy_run = True
+            self.eplb_updator.forward_end(dummy_run)
         return hidden_states

     def profile_run(self) -> None:
```
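
A minimal toy sketch of what threading `dummy_run=True` from `_dummy_run` buys (the `ToyAdaptor` class and its names are illustrative, not vllm-ascend code): a warm-up pass still reaches `forward_end`, but the cumulative expert load is returned untouched, so dummy batches never skew the statistics EPLB rebalances on.

```python
import torch

class ToyAdaptor:
    def __init__(self, num_layers: int = 2, num_experts: int = 8):
        self.cum_moe_load = torch.zeros((num_layers, num_experts), dtype=torch.int64)

    def get_rank_expert_workload(self, workload: torch.Tensor, dummy_run: bool = False):
        if dummy_run:
            return self.cum_moe_load        # dummy pass: no accumulation
        self.cum_moe_load.add_(workload)    # real pass: accumulate counts
        return self.cum_moe_load

adaptor = ToyAdaptor()
real = torch.ones_like(adaptor.cum_moe_load)
adaptor.get_rank_expert_workload(real)                   # counted
adaptor.get_rank_expert_workload(real, dummy_run=True)   # ignored
assert int(adaptor.cum_moe_load.sum()) == real.numel()
```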
