2
2
import logging
3
3
from concurrent .futures import ThreadPoolExecutor
4
4
from pathlib import Path
5
- from typing import Optional , Set
5
+ from typing import Optional , Set , List , Tuple , TYPE_CHECKING
6
6
7
+ import ray
7
8
import ray .dashboard .consts as dashboard_consts
8
9
import ray .dashboard .utils as dashboard_utils
9
10
import ray .experimental .internal_kv as internal_kv
26
27
except ImportError :
27
28
prometheus_client = None
28
29
30
+ if TYPE_CHECKING :
31
+ from ray .dashboard .subprocesses .handle import SubprocessModuleHandle
29
32
30
33
logger = logging .getLogger (__name__ )
31
34
@@ -70,6 +73,11 @@ def __init__(
70
73
node_ip_address : str ,
71
74
grpc_port : int ,
72
75
log_dir : str ,
76
+ logging_level : int ,
77
+ logging_format : str ,
78
+ logging_filename : str ,
79
+ logging_rotate_bytes : int ,
80
+ logging_rotate_backup_count : int ,
73
81
temp_dir : str ,
74
82
session_dir : str ,
75
83
minimal : bool ,
@@ -83,6 +91,11 @@ def __init__(
83
91
http_port_retries: The maximum retry to bind ports for the Http server.
84
92
gcs_address: The GCS address in the {address}:{port} format.
85
93
log_dir: The log directory. E.g., /tmp/session_latest/logs.
94
+ logging_level: The logging level (e.g. logging.INFO, logging.DEBUG)
95
+ logging_format: The format string for log messages
96
+ logging_filename: The name of the log file
97
+ logging_rotate_bytes: Max size in bytes before rotating log file
98
+ logging_rotate_backup_count: Number of backup files to keep when rotating
86
99
temp_dir: The temp directory. E.g., /tmp.
87
100
session_dir: The session directory. E.g., tmp/session_latest.
88
101
minimal: Whether or not it will load the minimal modules.
@@ -117,6 +130,11 @@ def __init__(
117
130
self .gcs_address = gcs_address
118
131
self .cluster_id_hex = cluster_id_hex
119
132
self .log_dir = log_dir
133
+ self .logging_level = logging_level
134
+ self .logging_format = logging_format
135
+ self .logging_filename = logging_filename
136
+ self .logging_rotate_bytes = logging_rotate_bytes
137
+ self .logging_rotate_backup_count = logging_rotate_backup_count
120
138
self .temp_dir = temp_dir
121
139
self .session_dir = session_dir
122
140
self .session_name = Path (session_dir ).name
@@ -137,7 +155,11 @@ def __init__(
137
155
# be configured to expose APIs.
138
156
self .http_server = None
139
157
140
- async def _configure_http_server (self , modules ):
158
+ async def _configure_http_server (
159
+ self ,
160
+ dashboard_head_modules : List [DashboardHeadModule ],
161
+ subprocess_module_handles : List ["SubprocessModuleHandle" ],
162
+ ):
141
163
from ray .dashboard .http_server_head import HttpServerDashboardHead
142
164
143
165
self .http_server = HttpServerDashboardHead (
@@ -149,7 +171,7 @@ async def _configure_http_server(self, modules):
149
171
self .session_name ,
150
172
self .metrics ,
151
173
)
152
- await self .http_server .run (modules )
174
+ await self .http_server .run (dashboard_head_modules , subprocess_module_handles )
153
175
154
176
@property
155
177
def http_session (self ):
@@ -173,8 +195,41 @@ async def _gcs_check_alive(self):
173
195
except Exception :
174
196
logger .warning ("Failed to check gcs aliveness, will retry" , exc_info = True )
175
197
176
- def _load_modules (self , modules_to_load : Optional [Set [str ]] = None ):
177
- """Load dashboard head modules.
198
+ def _load_modules (
199
+ self , modules_to_load : Optional [Set [str ]] = None
200
+ ) -> Tuple [List [DashboardHeadModule ], List ["SubprocessModuleHandle" ]]:
201
+ """
202
+ If minimal, only load DashboardHeadModule.
203
+ If non-minimal, load both kinds of modules: DashboardHeadModule, SubprocessModule.
204
+
205
+ If modules_to_load is not None, only load the modules in the set.
206
+ """
207
+ dashboard_head_modules = self ._load_dashboard_head_modules (modules_to_load )
208
+ subprocess_module_handles = self ._load_subprocess_module_handles (
209
+ modules_to_load
210
+ )
211
+
212
+ all_names = {type (m ).__name__ for m in dashboard_head_modules } | {
213
+ h .module_cls .__name__ for h in subprocess_module_handles
214
+ }
215
+ assert len (all_names ) == len (dashboard_head_modules ) + len (
216
+ subprocess_module_handles
217
+ ), "Duplicate module names. A module name can't be a DashboardHeadModule and a SubprocessModule at the same time."
218
+
219
+ # Verify modules are loaded as expected.
220
+ if modules_to_load is not None and all_names != modules_to_load :
221
+ assert False , (
222
+ f"Actual loaded modules { all_names } , doesn't match the requested modules "
223
+ f"to load, { modules_to_load } ."
224
+ )
225
+
226
+ self ._modules_loaded = True
227
+ return dashboard_head_modules , subprocess_module_handles
228
+
229
+ def _load_dashboard_head_modules (
230
+ self , modules_to_load : Optional [Set [str ]] = None
231
+ ) -> List [DashboardHeadModule ]:
232
+ """Load `DashboardHeadModule`s.
178
233
179
234
Args:
180
235
modules: A list of module names to load. By default (None),
@@ -198,27 +253,72 @@ def _load_modules(self, modules_to_load: Optional[Set[str]] = None):
198
253
)
199
254
200
255
# Select modules to load.
201
- modules_to_load = modules_to_load or {m .__name__ for m in head_cls_list }
202
- logger .info ("Modules to load: %s" , modules_to_load )
256
+ if modules_to_load is not None :
257
+ head_cls_list = [
258
+ cls for cls in head_cls_list if cls .__name__ in modules_to_load
259
+ ]
203
260
204
- for cls in head_cls_list :
205
- logger .info ("Loading %s: %s" , DashboardHeadModule .__name__ , cls )
206
- if cls .__name__ in modules_to_load :
207
- c = cls (config )
208
- modules .append (c )
261
+ logger .info (f"DashboardHeadModules to load: { modules_to_load } ." )
209
262
210
- # Verify modules are loaded as expected.
211
- loaded_modules = {type (m ).__name__ for m in modules }
212
- if loaded_modules != modules_to_load :
213
- assert False , (
214
- "Actual loaded modules, {}, doesn't match the requested modules "
215
- "to load, {}" .format (loaded_modules , modules_to_load )
216
- )
263
+ for cls in head_cls_list :
264
+ logger .info (f"Loading { DashboardHeadModule .__name__ } : { cls } ." )
265
+ c = cls (config )
266
+ modules .append (c )
217
267
218
- self ._modules_loaded = True
219
- logger .info ("Loaded %d modules. %s" , len (modules ), modules )
268
+ logger .info (f"Loaded { len (modules )} dashboard head modules: { modules } ." )
220
269
return modules
221
270
271
def _load_subprocess_module_handles(
    self, modules_to_load: Optional[Set[str]] = None
) -> List["SubprocessModuleHandle"]:
    """Build a `SubprocessModuleHandle` for each selected `SubprocessModule`.

    If minimal, return an empty list without loading anything.

    Args:
        modules_to_load: Class names of the subprocess modules to load.
            By default (None), it loads all modules.

    Returns:
        One handle per selected `SubprocessModule` class.
    """
    if self.minimal:
        logger.info("Subprocess modules not loaded in minimal mode.")
        return []

    # These imports are only reached on the non-minimal path.
    from ray.dashboard.subprocesses.module import (
        SubprocessModule,
        SubprocessModuleConfig,
    )
    from ray.dashboard.subprocesses.handle import SubprocessModuleHandle

    candidate_classes = dashboard_utils.get_all_modules(SubprocessModule)
    event_loop = ray._common.utils.get_or_create_event_loop()

    # A single config object is shared by every handle created below.
    module_config = SubprocessModuleConfig(
        cluster_id_hex=self.cluster_id_hex,
        gcs_address=self.gcs_address,
        logging_level=self.logging_level,
        logging_format=self.logging_format,
        log_dir=self.log_dir,
        logging_filename=self.logging_filename,
        logging_rotate_bytes=self.logging_rotate_bytes,
        logging_rotate_backup_count=self.logging_rotate_backup_count,
        socket_dir=str(Path(self.session_dir) / "sockets"),
    )

    # Keep only the requested classes when a name filter was supplied.
    if modules_to_load is not None:
        candidate_classes = [
            module_cls
            for module_cls in candidate_classes
            if module_cls.__name__ in modules_to_load
        ]

    handles = []
    for module_cls in candidate_classes:
        logger.info(f"Loading {SubprocessModule.__name__}: {module_cls}.")
        handles.append(SubprocessModuleHandle(event_loop, module_cls, module_config))

    logger.info(f"Loaded {len(handles)} subprocess modules: {handles}.")
    return handles
321
+
222
322
async def _setup_metrics (self , gcs_aio_client ):
223
323
metrics = DashboardPrometheusMetrics ()
224
324
@@ -291,12 +391,22 @@ async def _async_notify():
291
391
except Exception :
292
392
logger .exception (f"Error notifying coroutine { co } " )
293
393
294
- modules = self ._load_modules (self ._modules_to_load )
394
+ dashboard_head_modules , subprocess_module_handles = self ._load_modules (
395
+ self ._modules_to_load
396
+ )
397
+ # Parallel start all subprocess modules.
398
+ for handle in subprocess_module_handles :
399
+ handle .start_module ()
400
+ # Wait for all subprocess modules to be ready.
401
+ for handle in subprocess_module_handles :
402
+ handle .wait_for_module_ready ()
295
403
296
404
http_host , http_port = self .http_host , self .http_port
297
405
if self .serve_frontend :
298
406
logger .info ("Initialize the http server." )
299
- await self ._configure_http_server (modules )
407
+ await self ._configure_http_server (
408
+ dashboard_head_modules , subprocess_module_handles
409
+ )
300
410
http_host , http_port = self .http_server .get_address ()
301
411
logger .info (f"http server initialized at { http_host } :{ http_port } " )
302
412
else :
@@ -336,7 +446,7 @@ async def _async_notify():
336
446
DataOrganizer .purge (),
337
447
DataOrganizer .organize (self ._executor ),
338
448
]
339
- for m in modules :
449
+ for m in dashboard_head_modules :
340
450
concurrent_tasks .append (m .run (self .server ))
341
451
if self .server :
342
452
concurrent_tasks .append (self .server .wait_for_termination ())
0 commit comments