13
13
from vllm .model_executor .utils import set_random_seed
14
14
from vllm .platforms import CpuArchEnum , current_platform
15
15
from vllm .sequence import IntermediateTensors
16
+ from vllm .utils import PlaceholderModule
16
17
from vllm .v1 .core .sched .output import SchedulerOutput
17
18
from vllm .v1 .outputs import ModelRunnerOutput
18
19
from vllm .v1 .worker .cpu_model_runner import CPUModelRunner
19
20
from vllm .v1 .worker .gpu_worker import (Worker ,
20
21
init_worker_distributed_environment )
21
22
23
+ try :
24
+ import psutil
25
+ from numa import info
26
+ except ImportError :
27
+ psutil = PlaceholderModule ("psutil" ) # type: ignore[assignment]
28
+ numa = PlaceholderModule ("numa" ) # type: ignore[assignment]
29
+
22
30
logger = init_logger (__name__ )
23
31
24
32
@@ -37,6 +45,8 @@ def __init__(self,
37
45
is_driver_worker = is_driver_worker )
38
46
39
47
self .parallel_config .disable_custom_all_reduce = True
48
+ self .manually_bind_threads_suggestion = (
49
+ "To get better performance, please try to manually bind threads." )
40
50
41
51
def init_device (self ):
42
52
# Setup OpenMP threads affinity.
@@ -112,50 +122,111 @@ def execute_model(
112
122
assert isinstance (output , ModelRunnerOutput )
113
123
return output if self .is_driver_worker else None
114
124
125
+ def warn_inability_to_detect_numa (self ) -> None :
126
+ logger .warning (
127
+ "Auto thread-binding failed due to the "
128
+ "inability to detect numa nodes. %s" ,
129
+ self .manually_bind_threads_suggestion )
130
+
131
+ def warn_lack_of_numa_and_psutil (self ) -> None :
132
+ logger .warning (
133
+ "Auto thread-binding failed due to "
134
+ "the lack of package numa and psutil. %s" ,
135
+ self .manually_bind_threads_suggestion )
136
+
137
+ def warn_world_size_too_large (self , world_size : int ,
138
+ node_to_cpus_len : int ) -> None :
139
+ logger .warning (
140
+ "Auto thread-binding failed due to "
141
+ "world size: %d being larger than "
142
+ "allowed NUMA nodes number: %d. %s" , world_size , node_to_cpus_len ,
143
+ self .manually_bind_threads_suggestion )
144
+
145
+ def get_cpus_allow_list_and_numa_size (self ):
146
+ cpus_allow_list = psutil .Process ().cpu_affinity ()
147
+ numa_size = info .get_num_configured_nodes ()
148
+ return cpus_allow_list , numa_size
149
+
150
+ def auto_thread_binding_based_on_numa_nodes (self , world_size : int ,
151
+ rank_to_cpus : str ) -> str :
152
+ cpu_count = psutil .cpu_count (logical = False )
153
+ cpus_allow_list , numa_size = self .get_cpus_allow_list_and_numa_size ()
154
+ if not numa_size :
155
+ self .warn_inability_to_detect_numa ()
156
+ return rank_to_cpus
157
+
158
+ cpu_count_per_numa = cpu_count // numa_size
159
+ num_of_reserved_cpu = min (envs .VLLM_CPU_NUM_OF_RESERVED_CPU ,
160
+ cpu_count_per_numa // 2 )
161
+
162
+ node_to_cpus = []
163
+ for i in range (numa_size ):
164
+ node_intersect = set (
165
+ info .node_to_cpus (i )).intersection (cpus_allow_list )
166
+ if bool (node_intersect ):
167
+ node_to_cpus .append (list (node_intersect ))
168
+
169
+ node_to_cpus_len = len (node_to_cpus )
170
+ if world_size > node_to_cpus_len :
171
+ self .warn_world_size_too_large (world_size , node_to_cpus_len )
172
+ else :
173
+ end = cpu_count_per_numa - num_of_reserved_cpu
174
+ rank_to_cpus_list = node_to_cpus [self .rank ][:end ]
175
+ rank_to_cpus = ',' .join (str (x ) for x in rank_to_cpus_list )
176
+ logger .info ("auto thread-binding list: %s" , rank_to_cpus )
177
+ return rank_to_cpus
178
+
179
+ def libnuma_and_psutil_found (self ) -> bool :
180
+ libnuma_found = util .find_spec ("numa" ) is not None
181
+ psutil_found = util .find_spec ("psutil" ) is not None
182
+
183
+ return libnuma_found and psutil_found
184
+
115
185
def get_cpus_id_binding_based_on_numa_nodes (self ) -> str :
116
186
"""Return CPUs id binding based on NUMA nodes.
117
187
"""
118
188
rank_to_cpus = self .local_omp_cpuid
119
189
# Setup OpenMP thread affinity based on NUMA nodes automatically
120
190
world_size = self .vllm_config .parallel_config .world_size
121
- libnuma_found = util .find_spec ("numa" ) is not None
122
- psutil_found = util .find_spec ("psutil" ) is not None
123
- if libnuma_found and psutil_found :
124
- import psutil
125
- from numa import info
126
- cpu_count = psutil .cpu_count (logical = False )
127
- cpus_allow_list = psutil .Process ().cpu_affinity ()
128
- numa_size = info .get_num_configured_nodes ()
129
- cpu_count_per_numa = cpu_count // numa_size
130
- num_of_reserved_cpu = min (envs .VLLM_CPU_NUM_OF_RESERVED_CPU ,
131
- cpu_count_per_numa // 2 )
191
+ if self .libnuma_and_psutil_found ():
192
+ rank_to_cpus = self .auto_thread_binding_based_on_numa_nodes (
193
+ world_size , rank_to_cpus )
194
+ else :
195
+ self .warn_lack_of_numa_and_psutil ()
196
+ return rank_to_cpus
132
197
133
- # check allow node_to_cpus list
134
- node_to_cpus = []
135
- for i in range (numa_size ):
136
- node_intersect = set (
137
- info .node_to_cpus (i )).intersection (cpus_allow_list )
138
- if bool (node_intersect ):
139
- node_to_cpus .append (list (node_intersect ))
140
-
141
- if world_size > len (node_to_cpus ):
142
- logger .error (
143
- "Auto thread-binding failed due to "
144
- "world size: %d is larger than "
145
- "allowed NUMA nodes number: %d."
146
- "Please try to bind threads manually." , world_size ,
147
- len (node_to_cpus ))
148
- else :
149
- end = cpu_count_per_numa - num_of_reserved_cpu
150
- rank_to_cpus_list = node_to_cpus [self .rank ][:end ]
151
- rank_to_cpus = ',' .join (str (x ) for x in rank_to_cpus_list )
152
- logger .info ("auto thread-binding list: %s" , rank_to_cpus )
198
+ def select_threads_per_power_core (self ,
199
+ node_cpu_ids : list [int ]) -> list [int ]:
200
+ return [cpu for cpu in node_cpu_ids if cpu % 8 < 4 ]
201
+
202
+ def auto_thread_binding_based_on_numa_nodes_ppc64le (
203
+ self , world_size : int , rank_to_cpus : str ) -> str :
204
+ cpus_allow_list , numa_size = self .get_cpus_allow_list_and_numa_size ()
205
+ if not numa_size :
206
+ self .warn_inability_to_detect_numa ()
207
+ return rank_to_cpus
208
+
209
+ node_to_cpus = []
210
+ for i in range (numa_size ):
211
+ node_intersect = set (
212
+ info .node_to_cpus (i )).intersection (cpus_allow_list )
213
+ if bool (node_intersect ):
214
+ node_to_cpus .append (sorted (list (node_intersect )))
215
+
216
+ node_to_cpus_len = len (node_to_cpus )
217
+ if world_size > node_to_cpus_len :
218
+ self .warn_world_size_too_large (world_size , node_to_cpus_len )
153
219
else :
154
- logger .warning (
155
- "Auto thread-binding is not supported due to "
156
- "the lack of package numa and psutil,"
157
- "fallback to no thread-binding. To get better performance,"
158
- "please try to manually bind threads." )
220
+ node_cpus_this_rank = node_to_cpus [self .rank ]
221
+ node_cpus_this_rank = self .select_threads_per_power_core (
222
+ node_cpus_this_rank )
223
+ cpu_count_per_numa = len (node_cpus_this_rank )
224
+ num_of_reserved_cpu = min (envs .VLLM_CPU_NUM_OF_RESERVED_CPU ,
225
+ cpu_count_per_numa // 2 )
226
+ end = cpu_count_per_numa - num_of_reserved_cpu
227
+ rank_to_cpus_list = node_cpus_this_rank [:end ]
228
+ rank_to_cpus = ',' .join (str (x ) for x in rank_to_cpus_list )
229
+ logger .info ("ppc64le thread-binding list: %s" , rank_to_cpus )
159
230
return rank_to_cpus
160
231
161
232
def get_cpus_id_binding_based_on_numa_nodes_ppc64le (self ) -> str :
@@ -166,48 +237,11 @@ def get_cpus_id_binding_based_on_numa_nodes_ppc64le(self) -> str:
166
237
performance by avoiding oversubscription of logical CPUs on Power.
167
238
"""
168
239
169
- def select_threads_per_power_core (node_cpu_ids ):
170
- return [cpu for cpu in node_cpu_ids if cpu % 8 < 4 ]
171
-
172
240
rank_to_cpus = self .local_omp_cpuid
173
241
world_size = self .vllm_config .parallel_config .world_size
174
- libnuma_found = util .find_spec ("numa" ) is not None
175
- psutil_found = util .find_spec ("psutil" ) is not None
176
- if libnuma_found and psutil_found :
177
- import psutil
178
- from numa import info
179
- cpus_allow_list = psutil .Process ().cpu_affinity ()
180
- numa_size = info .get_num_configured_nodes ()
181
-
182
- node_to_cpus = []
183
- for i in range (numa_size ):
184
- node_intersect = set (
185
- info .node_to_cpus (i )).intersection (cpus_allow_list )
186
- if bool (node_intersect ):
187
- node_to_cpus .append (sorted (list (node_intersect )))
188
-
189
- if world_size > len (node_to_cpus ):
190
- logger .error (
191
- "Auto thread-binding failed due to "
192
- "world size: %d is larger than "
193
- "allowed NUMA nodes number: %d."
194
- "Please try to bind threads manually." , world_size ,
195
- len (node_to_cpus ))
196
- else :
197
- node_cpus_this_rank = node_to_cpus [self .rank ]
198
- node_cpus_this_rank = select_threads_per_power_core (
199
- node_cpus_this_rank )
200
- cpu_count_per_numa = len (node_cpus_this_rank )
201
- num_of_reserved_cpu = min (envs .VLLM_CPU_NUM_OF_RESERVED_CPU ,
202
- cpu_count_per_numa // 2 )
203
- end = cpu_count_per_numa - num_of_reserved_cpu
204
- rank_to_cpus_list = node_cpus_this_rank [:end ]
205
- rank_to_cpus = ',' .join (str (x ) for x in rank_to_cpus_list )
206
- logger .info ("ppc64le thread-binding list: %s" , rank_to_cpus )
242
+ if self .libnuma_and_psutil_found ():
243
+ rank_to_cpus = self .auto_thread_binding_based_on_numa_nodes_ppc64le (
244
+ world_size , rank_to_cpus )
207
245
else :
208
- logger .warning (
209
- "Auto thread-binding is not supported due to "
210
- "the lack of package numa and psutil,"
211
- "fallback to no thread-binding. To get better performance,"
212
- "please try to manually bind threads." )
246
+ self .warn_lack_of_numa_and_psutil ()
213
247
return rank_to_cpus
0 commit comments