File tree Expand file tree Collapse file tree 4 files changed +83
-0
lines changed Expand file tree Collapse file tree 4 files changed +83
-0
lines changed Original file line number Diff line number Diff line change
1
+ # Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved.
2
from abc import ABC, abstractmethod
3
+
4
+
5
class DynamicConfig:
    """Settings shared with the expert load-balancing policies.

    All values are plain class-level defaults; a policy receives an instance
    of this class through its constructor.
    """

    # Placement policy; unset (None) by default.
    placement_policy = None

    # Upper bound on how many experts may be moved per layer on one host.
    max_transferred_expert_per_layer = 100

    # Number of dies, across the whole cluster, that the experts are spread over.
    ep_worldsize = 64

    # Number of dies on each host.
    num_die_per_host = 8
13
+
14
+
15
class EplbPolicy(ABC):
    """Abstract base class for expert-placement load-balancing policies.

    Deriving from ``ABC`` is what makes ``@abstractmethod`` effective: the
    base class itself, and any subclass that does not override
    :meth:`rebalance_experts`, cannot be instantiated.
    """

    def __init__(self, config: "DynamicConfig"):
        # Keep the shared configuration available to concrete policies.
        self.config = config

    @abstractmethod
    def rebalance_experts(self, current_expert_table, expert_workload):
        """Compute expert replication and placement under the given constraints.

        Args:
            current_expert_table: Current placement, indexed as
                ``[layerId, rankId, expert_num_i]``.
            expert_workload: Workload values; presumably indexed the same way
                as the table, i.e. ``[layerId][rankId][expert_num_i]``
                (confirm against the caller).

        Returns:
            ``(res, expert_table)`` where

            * ``res`` is ``1`` when the table changed and ``0`` when it did
              not;
            * ``expert_table`` is indexed as
              ``[layerId, rankId, expert_num_i]`` with ``expert_num_i`` in
              ``[0, MaxExpertPerRank]`` and
              ``expertID = expert_table[layer0][rankId][expert_num_i]``.

            Example values for one layer (one row per rank)::

                [0, 1, 2, 3, 248]
                [4, 5, 6, 7, 254]
                [8, 9, 10, 11, 71]
                ...
                [252, 253, 254, 255, 0]

            NOTE(review): ``MockLoadBalance.rebalance_experts`` returns a
            3-tuple ``(res, per-layer list, expert_table)``; confirm which
            shape callers actually expect and align this contract with it.
        """
Original file line number Diff line number Diff line change
1
+ # Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved.
2
+ import copy
3
+ import random
4
+
5
+ from .eplb_policy import EplbPolicy , DynamicConfig
6
+
7
+
8
class MockLoadBalance(EplbPolicy):
    """Mock policy that swaps the last expert slot of two random cards per layer.

    The workload is ignored; the policy only shuffles the table.
    """

    def __init__(self, config: DynamicConfig):
        super().__init__(config)

    def rebalance_experts(self, current_expert_table, expert_workload):
        """Return a copy of the table with one random swap applied per layer.

        For every layer, two distinct cards are picked at random and the last
        entry of each card's row (the redundant expert) is exchanged. The
        input table is never modified.

        Args:
            current_expert_table: Placement indexed as
                ``[layer][card][slot]``; the card count is taken from the
                first layer.
            expert_workload: Unused by this mock policy.

        Returns:
            A 3-tuple ``(changed, layer_values, new_table)``:

            * ``changed`` is ``1`` when swaps were performed, or ``0`` when
              the table is empty or has fewer than two cards so that no swap
              is possible;
            * ``layer_values`` is ``[0, -1, -2, ...]``, one entry per layer
              (NOTE(review): looks like a per-layer priority -- confirm
              against the caller);
            * ``new_table`` is the deep-copied, possibly shuffled table.
        """
        new_table = copy.deepcopy(current_expert_table)
        num_layers = len(current_expert_table)
        layer_values = [-i for i in range(num_layers)]

        # Guard the degenerate shapes: indexing layer 0 of an empty table
        # raises IndexError and random.sample needs at least two cards.
        num_card = len(current_expert_table[0]) if num_layers else 0
        if num_card < 2:
            return 0, layer_values, new_table

        for new_layer, old_layer in zip(new_table, current_expert_table):
            # Pick two distinct cards at random.
            first, second = random.sample(range(num_card), 2)

            # Swap the redundant experts. Values are read from the untouched
            # input table so the exchange stays correct even if element
            # indexing yields views rather than plain values.
            new_layer[first][-1] = old_layer[second][-1]
            new_layer[second][-1] = old_layer[first][-1]
        return 1, layer_values, new_table
27
+
Original file line number Diff line number Diff line change
1
+ # Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved.
2
+ from .eplb_policy import EplbPolicy , DynamicConfig
3
+ from .mock_load_balance import MockLoadBalance
4
+ from .dynamic_ep import DynamicEP
5
+
6
+
7
class PolicyFactory:
    """Builds load-balancing policy instances from an integer policy type."""

    @staticmethod
    def generate_policy(policy_type: int, config: DynamicConfig) -> EplbPolicy:
        """Instantiate the policy registered for ``policy_type``.

        Args:
            policy_type: ``0`` selects ``MockLoadBalance`` and ``1`` selects
                ``DynamicEP``; any other value falls back to
                ``MockLoadBalance``.
            config: Configuration handed to the policy constructor.

        Returns:
            A newly constructed policy instance.
        """
        registry = {
            0: MockLoadBalance,
            1: DynamicEP,
        }
        try:
            policy_cls = registry[policy_type]
        except KeyError:
            # Unknown types fall back to the mock policy.
            policy_cls = MockLoadBalance
        return policy_cls(config)
You can’t perform that action at this time.
0 commit comments