ray-project
diff --git a/python/ray/_private/node.py b/python/ray/_private/node.py
Lines changed: 7 additions & 8 deletions
diff --git a/python/ray/_private/usage/usage_lib.py b/python/ray/_private/usage/usage_lib.py
Lines changed: 8 additions & 8 deletions
diff --git a/python/ray/_private/utils.py b/python/ray/_private/utils.py
Lines changed: 15 additions & 0 deletions
diff --git a/python/ray/dashboard/dashboard.py b/python/ray/dashboard/dashboard.py
Lines changed: 21 additions & 1 deletion
diff --git a/python/ray/dashboard/head.py b/python/ray/dashboard/head.py
Lines changed: 134 additions & 24 deletions
@@ -26,7 +26,12 @@
 from ray._raylet import GcsClient, get_session_key_from_storage
 from ray._private.resource_spec import ResourceSpec
 from ray._private.services import serialize_config, get_address
-from ray._private.utils import open_log, try_to_create_directory, try_to_symlink
+from ray._private.utils import (
+    open_log,
+    try_to_create_directory,
+    try_to_symlink,
+    validate_socket_filepath,
+)
 
 # Logger for this module. It should be configured at the entry point
 # into the program using Ray. Ray configures it by default automatically
@@ -989,7 +994,6 @@ def _prepare_socket_file(self, socket_path: str, default_prefix: str):
             socket_path: the socket file to prepare.
         """
         result = socket_path
-        is_mac = sys.platform.startswith("darwin")
         if sys.platform == "win32":
             if socket_path is None:
                 result = f"tcp://{self._localhost}:{self._get_unused_port()}"
@@ -1001,12 +1005,7 @@ def _prepare_socket_file(self, socket_path: str, default_prefix: str):
             else:
                 try_to_create_directory(os.path.dirname(socket_path))
 
-            # Check socket path length to make sure it's short enough
-            maxlen = (104 if is_mac else 108) - 1  # sockaddr_un->sun_path
-            if len(result.split("://", 1)[-1].encode("utf-8")) > maxlen:
-                raise OSError(
-                    f"AF_UNIX path length cannot exceed {maxlen} bytes: {result!r}"
-                )
+            validate_socket_filepath(result.split("://", 1)[-1])
         return result
 
     def _get_cached_port(
 
@@ -548,7 +548,7 @@ def get_total_num_running_jobs_to_report(gcs_client) -> Optional[int]:
                 total_num_running_jobs += 1
         return total_num_running_jobs
     except Exception as e:
-        logger.info(f"Faile to query number of running jobs in the cluster: {e}")
+        logger.info(f"Failed to query number of running jobs in the cluster: {e}")
         return None
 
 
@@ -562,7 +562,7 @@ def get_total_num_nodes_to_report(gcs_client, timeout=None) -> Optional[int]:
                 total_num_nodes += 1
         return total_num_nodes
     except Exception as e:
-        logger.info(f"Faile to query number of nodes in the cluster: {e}")
+        logger.info(f"Failed to query number of nodes in the cluster: {e}")
         return None
 
 
@@ -593,24 +593,24 @@ def get_extra_usage_tags_to_report(gcs_client) -> Dict[str, str]:
                 k, v = kv.split("=")
                 extra_usage_tags[k] = v
         except Exception as e:
-            logger.info(f"Failed to parse extra usage tags env var. Error: {e}")
+            logger.info(f"Failed to parse extra usage tags env var: {e}")
 
     valid_tag_keys = [tag_key.lower() for tag_key in TagKey.keys()]
     try:
         keys = gcs_client.internal_kv_keys(
             usage_constant.EXTRA_USAGE_TAG_PREFIX.encode(),
             namespace=usage_constant.USAGE_STATS_NAMESPACE.encode(),
         )
-        for key in keys:
-            value = gcs_client.internal_kv_get(
-                key, namespace=usage_constant.USAGE_STATS_NAMESPACE.encode()
-            )
+        kv = gcs_client.internal_kv_multi_get(
+            keys, namespace=usage_constant.USAGE_STATS_NAMESPACE.encode()
+        )
+        for key, value in kv.items():
             key = key.decode("utf-8")
             key = key[len(usage_constant.EXTRA_USAGE_TAG_PREFIX) :]
             assert key in valid_tag_keys
             extra_usage_tags[key] = value.decode("utf-8")
     except Exception as e:
-        logger.info(f"Failed to get extra usage tags from kv store {e}")
+        logger.info(f"Failed to get extra usage tags from kv store: {e}")
     return extra_usage_tags
 
 
 
@@ -1925,3 +1925,18 @@ def get_current_node_cpu_model_name() -> Optional[str]:
     except Exception:
         logger.debug("Failed to get CPU model name", exc_info=True)
         return None
+
+
+def validate_socket_filepath(filepath: str):
+    """Validate that the file path is short enough for a Unix socket.
+
+    No-op on Windows; raises OSError if the UTF-8 encoded path is too long.
+    """
+    if sys.platform == "win32":
+        return
+    is_mac = sys.platform.startswith("darwin")
+    maxlen = (104 if is_mac else 108) - 1  # sockaddr_un->sun_path
+    if len(filepath.encode("utf-8")) > maxlen:
+        raise OSError(
+            f"validate_socket_filepath failed: AF_UNIX path length cannot exceed {maxlen} bytes: {filepath}"
+        )
@@ -44,6 +44,11 @@ class Dashboard:
         serve_frontend: If configured, frontend HTML
             is not served from the dashboard.
         log_dir: Log directory of dashboard.
+        logging_level: The logging level (e.g. logging.INFO, logging.DEBUG)
+        logging_format: The format string for log messages
+        logging_filename: The name of the log file
+        logging_rotate_bytes: Max size in bytes before rotating log file
+        logging_rotate_backup_count: Number of backup files to keep when rotating
     """
 
     def __init__(
@@ -55,7 +60,12 @@ def __init__(
         cluster_id_hex: str,
         grpc_port: int,
         node_ip_address: str,
-        log_dir: str = None,
+        log_dir: str,
+        logging_level: int,
+        logging_format: str,
+        logging_filename: str,
+        logging_rotate_bytes: int,
+        logging_rotate_backup_count: int,
         temp_dir: str = None,
         session_dir: str = None,
         minimal: bool = False,
@@ -71,6 +81,11 @@ def __init__(
             node_ip_address=node_ip_address,
             grpc_port=grpc_port,
             log_dir=log_dir,
+            logging_level=logging_level,
+            logging_format=logging_format,
+            logging_filename=logging_filename,
+            logging_rotate_bytes=logging_rotate_bytes,
+            logging_rotate_backup_count=logging_rotate_backup_count,
             temp_dir=temp_dir,
             session_dir=session_dir,
             minimal=minimal,
@@ -236,6 +251,11 @@ async def run(self):
             grpc_port=args.grpc_port,
             node_ip_address=args.node_ip_address,
             log_dir=args.log_dir,
+            logging_level=args.logging_level,
+            logging_format=args.logging_format,
+            logging_filename=args.logging_filename,
+            logging_rotate_bytes=args.logging_rotate_bytes,
+            logging_rotate_backup_count=args.logging_rotate_backup_count,
             temp_dir=args.temp_dir,
             session_dir=args.session_dir,
             minimal=args.minimal,
 
@@ -2,8 +2,9 @@
 import logging
 from concurrent.futures import ThreadPoolExecutor
 from pathlib import Path
-from typing import Optional, Set
+from typing import Optional, Set, List, Tuple, TYPE_CHECKING
 
+import ray
 import ray.dashboard.consts as dashboard_consts
 import ray.dashboard.utils as dashboard_utils
 import ray.experimental.internal_kv as internal_kv
@@ -26,6 +27,8 @@
 except ImportError:
     prometheus_client = None
 
+if TYPE_CHECKING:
+    from ray.dashboard.subprocesses.handle import SubprocessModuleHandle
 
 logger = logging.getLogger(__name__)
 
@@ -70,6 +73,11 @@ def __init__(
         node_ip_address: str,
         grpc_port: int,
         log_dir: str,
+        logging_level: int,
+        logging_format: str,
+        logging_filename: str,
+        logging_rotate_bytes: int,
+        logging_rotate_backup_count: int,
         temp_dir: str,
         session_dir: str,
         minimal: bool,
@@ -83,6 +91,11 @@ def __init__(
             http_port_retries: The maximum retry to bind ports for the Http server.
             gcs_address: The GCS address in the {address}:{port} format.
             log_dir: The log directory. E.g., /tmp/session_latest/logs.
+            logging_level: The logging level (e.g. logging.INFO, logging.DEBUG)
+            logging_format: The format string for log messages
+            logging_filename: The name of the log file
+            logging_rotate_bytes: Max size in bytes before rotating log file
+            logging_rotate_backup_count: Number of backup files to keep when rotating
             temp_dir: The temp directory. E.g., /tmp.
             session_dir: The session directory. E.g., tmp/session_latest.
             minimal: Whether or not it will load the minimal modules.
@@ -117,6 +130,11 @@ def __init__(
         self.gcs_address = gcs_address
         self.cluster_id_hex = cluster_id_hex
         self.log_dir = log_dir
+        self.logging_level = logging_level
+        self.logging_format = logging_format
+        self.logging_filename = logging_filename
+        self.logging_rotate_bytes = logging_rotate_bytes
+        self.logging_rotate_backup_count = logging_rotate_backup_count
         self.temp_dir = temp_dir
         self.session_dir = session_dir
         self.session_name = Path(session_dir).name
@@ -137,7 +155,11 @@ def __init__(
         # be configured to expose APIs.
         self.http_server = None
 
-    async def _configure_http_server(self, modules):
+    async def _configure_http_server(
+        self,
+        dashboard_head_modules: List[DashboardHeadModule],
+        subprocess_module_handles: List["SubprocessModuleHandle"],
+    ):
         from ray.dashboard.http_server_head import HttpServerDashboardHead
 
         self.http_server = HttpServerDashboardHead(
@@ -149,7 +171,7 @@ async def _configure_http_server(self, modules):
             self.session_name,
             self.metrics,
         )
-        await self.http_server.run(modules)
+        await self.http_server.run(dashboard_head_modules, subprocess_module_handles)
 
     @property
     def http_session(self):
@@ -173,8 +195,41 @@ async def _gcs_check_alive(self):
         except Exception:
             logger.warning("Failed to check gcs aliveness, will retry", exc_info=True)
 
-    def _load_modules(self, modules_to_load: Optional[Set[str]] = None):
-        """Load dashboard head modules.
+    def _load_modules(
+        self, modules_to_load: Optional[Set[str]] = None
+    ) -> Tuple[List[DashboardHeadModule], List["SubprocessModuleHandle"]]:
+        """Load DashboardHeadModules and, if non-minimal, SubprocessModule handles.
+
+        If modules_to_load is not None, only load the modules in the set; the
+        loaded class names must then equal that set exactly.
+        """
+        dashboard_head_modules = self._load_dashboard_head_modules(modules_to_load)
+        subprocess_module_handles = self._load_subprocess_module_handles(
+            modules_to_load
+        )
+
+        all_names = {type(m).__name__ for m in dashboard_head_modules} | {
+            h.module_cls.__name__ for h in subprocess_module_handles
+        }
+        # Raise explicitly: `assert` statements are stripped under `python -O`.
+        if len(all_names) != len(dashboard_head_modules) + len(
+            subprocess_module_handles
+        ):
+            raise AssertionError(
+                "Duplicate module names. A module name can't be a DashboardHeadModule and a SubprocessModule at the same time."
+            )
+        if modules_to_load is not None and all_names != modules_to_load:
+            raise AssertionError(
+                f"Actual loaded modules {all_names}, doesn't match the requested modules "
+                f"to load, {modules_to_load}."
+            )
+        self._modules_loaded = True
+        return dashboard_head_modules, subprocess_module_handles
+
+    def _load_dashboard_head_modules(
+        self, modules_to_load: Optional[Set[str]] = None
+    ) -> List[DashboardHeadModule]:
+        """Load `DashboardHeadModule`s.
 
         Args:
             modules: A list of module names to load. By default (None),
@@ -198,27 +253,72 @@ def _load_modules(self, modules_to_load: Optional[Set[str]] = None):
         )
 
         # Select modules to load.
-        modules_to_load = modules_to_load or {m.__name__ for m in head_cls_list}
-        logger.info("Modules to load: %s", modules_to_load)
+        if modules_to_load is not None:
+            head_cls_list = [
+                cls for cls in head_cls_list if cls.__name__ in modules_to_load
+            ]
 
-        for cls in head_cls_list:
-            logger.info("Loading %s: %s", DashboardHeadModule.__name__, cls)
-            if cls.__name__ in modules_to_load:
-                c = cls(config)
-                modules.append(c)
+        logger.info(f"DashboardHeadModules to load: {modules_to_load}.")
 
-        # Verify modules are loaded as expected.
-        loaded_modules = {type(m).__name__ for m in modules}
-        if loaded_modules != modules_to_load:
-            assert False, (
-                "Actual loaded modules, {}, doesn't match the requested modules "
-                "to load, {}".format(loaded_modules, modules_to_load)
-            )
+        for cls in head_cls_list:
+            logger.info(f"Loading {DashboardHeadModule.__name__}: {cls}.")
+            c = cls(config)
+            modules.append(c)
 
-        self._modules_loaded = True
-        logger.info("Loaded %d modules. %s", len(modules), modules)
+        logger.info(f"Loaded {len(modules)} dashboard head modules: {modules}.")
         return modules
 
+    def _load_subprocess_module_handles(
+        self, modules_to_load: Optional[Set[str]] = None
+    ) -> List["SubprocessModuleHandle"]:
+        """
+        If minimal, return an empty list.
+        If non-minimal, load `SubprocessModule`s by creating Handles to them.
+
+        Args:
+            modules_to_load: A set of module class names to load. By default
+                (None), it loads all modules.
+        """
+        if self.minimal:
+            logger.info("Subprocess modules not loaded in minimal mode.")
+            return []
+
+        from ray.dashboard.subprocesses.module import (
+            SubprocessModule,
+            SubprocessModuleConfig,
+        )
+        from ray.dashboard.subprocesses.handle import SubprocessModuleHandle
+
+        handles = []
+        subprocess_cls_list = dashboard_utils.get_all_modules(SubprocessModule)
+
+        loop = ray._common.utils.get_or_create_event_loop()
+        config = SubprocessModuleConfig(
+            cluster_id_hex=self.cluster_id_hex,
+            gcs_address=self.gcs_address,
+            logging_level=self.logging_level,
+            logging_format=self.logging_format,
+            log_dir=self.log_dir,
+            logging_filename=self.logging_filename,
+            logging_rotate_bytes=self.logging_rotate_bytes,
+            logging_rotate_backup_count=self.logging_rotate_backup_count,
+            socket_dir=str(Path(self.session_dir) / "sockets"),
+        )
+
+        # Select modules to load.
+        if modules_to_load is not None:
+            subprocess_cls_list = [
+                cls for cls in subprocess_cls_list if cls.__name__ in modules_to_load
+            ]
+
+        for cls in subprocess_cls_list:
+            logger.info(f"Loading {SubprocessModule.__name__}: {cls}.")
+            handle = SubprocessModuleHandle(loop, cls, config)
+            handles.append(handle)
+
+        logger.info(f"Loaded {len(handles)} subprocess modules: {handles}.")
+        return handles
+
     async def _setup_metrics(self, gcs_aio_client):
         metrics = DashboardPrometheusMetrics()
 
@@ -291,12 +391,22 @@ async def _async_notify():
                 except Exception:
                     logger.exception(f"Error notifying coroutine {co}")
 
-        modules = self._load_modules(self._modules_to_load)
+        dashboard_head_modules, subprocess_module_handles = self._load_modules(
+            self._modules_to_load
+        )
+        # Parallel start all subprocess modules.
+        for handle in subprocess_module_handles:
+            handle.start_module()
+        # Wait for all subprocess modules to be ready.
+        for handle in subprocess_module_handles:
+            handle.wait_for_module_ready()
 
         http_host, http_port = self.http_host, self.http_port
         if self.serve_frontend:
             logger.info("Initialize the http server.")
-            await self._configure_http_server(modules)
+            await self._configure_http_server(
+                dashboard_head_modules, subprocess_module_handles
+            )
             http_host, http_port = self.http_server.get_address()
             logger.info(f"http server initialized at {http_host}:{http_port}")
         else:
@@ -336,7 +446,7 @@ async def _async_notify():
             DataOrganizer.purge(),
             DataOrganizer.organize(self._executor),
         ]
-        for m in modules:
+        for m in dashboard_head_modules:
             concurrent_tasks.append(m.run(self.server))
         if self.server:
             concurrent_tasks.append(self.server.wait_for_termination())