[Feat][Core/Dashboard] Convert StateHead to subprocess module (#51676)

MortalHappiness · web-flow · commit 34ae461121a8 · 2025-03-26T08:41:30.000-07:00
Signed-off-by: Chi-Sheng Liu <chishengliu@chishengliu.com>
diff --git a/python/ray/dashboard/modules/state/state_head.py b/python/ray/dashboard/modules/state/state_head.py
@@ -8,8 +8,7 @@
 import aiohttp.web
 from aiohttp.web import Response
 
-import ray.dashboard.optional_utils as dashboard_optional_utils
-import ray.dashboard.utils as dashboard_utils
+import ray
 from ray import ActorID
 from ray._private.ray_constants import env_integer
 from ray._private.usage.usage_lib import TagKey, record_extra_usage_tag
@@ -28,12 +27,14 @@
     options_from_req,
 )
 from ray.dashboard.utils import RateLimitedModule
+from ray.dashboard.subprocesses.routes import SubprocessRouteTable as routes
+from ray.dashboard.subprocesses.module import SubprocessModule
+from ray.dashboard.subprocesses.utils import ResponseType
 from ray.util.state.common import DEFAULT_LOG_LIMIT, DEFAULT_RPC_TIMEOUT, GetLogOptions
 from ray.util.state.exception import DataSourceUnavailable
 from ray.util.state.state_manager import StateDataSourceClient
 
 logger = logging.getLogger(__name__)
-routes = dashboard_optional_utils.DashboardHeadRouteTable
 
 # NOTE: Executor in this head is intentionally constrained to just 1 thread by
 #       default to limit its concurrency, therefore reducing potential for
@@ -43,19 +44,16 @@
 )
 
 
-class StateHead(dashboard_utils.DashboardHeadModule, RateLimitedModule):
+class StateHead(SubprocessModule, RateLimitedModule):
     """Module to obtain state information from the Ray cluster.
 
     It is responsible for state observability APIs such as
     ray.list_actors(), ray.get_actor(), ray.summary_actors().
     """
 
-    def __init__(
-        self,
-        config: dashboard_utils.DashboardHeadModuleConfig,
-    ):
+    def __init__(self, *args, **kwargs):
         """Initialize for handling RESTful requests from State API Client"""
-        dashboard_utils.DashboardHeadModule.__init__(self, config)
+        SubprocessModule.__init__(self, *args, **kwargs)
         # We don't allow users to configure too high a rate limit
         RateLimitedModule.__init__(
             self,
@@ -73,6 +71,10 @@ def __init__(
             thread_name_prefix="state_head_executor",
         )
 
+        # Touch the lazy gcs_client property so that the internal KV gets initialized.
+        assert self.gcs_client is not None
+        assert ray.experimental.internal_kv._internal_kv_initialized()
+
     async def limit_handler_(self):
         return do_reply(
             success=False,
@@ -191,7 +193,7 @@ async def list_logs(self, req: aiohttp.web.Request) -> aiohttp.web.Response:
 
         return do_reply(success=True, error_message="", result=result)
 
-    @routes.get("/api/v0/logs/{media_type}")
+    @routes.get("/api/v0/logs/{media_type}", resp_type=ResponseType.STREAM)
     @RateLimitedModule.enforce_max_concurrent_calls
     async def get_logs(self, req: aiohttp.web.Request):
         """
@@ -333,7 +335,8 @@ async def delayed_response(self, req: aiohttp.web.Request):
             partial_failure_warning=None,
         )
 
-    async def run(self, server):
+    async def run(self):
+        await SubprocessModule.run(self)
         gcs_channel = self.aiogrpc_gcs_channel
         self._state_api_data_source_client = StateDataSourceClient(
             gcs_channel, self.gcs_aio_client
@@ -343,7 +346,3 @@ async def run(self, server):
             self._executor,
         )
         self._log_api = LogsManager(self._state_api_data_source_client)
-
-    @staticmethod
-    def is_minimal_module():
-        return False
diff --git a/python/ray/dashboard/subprocesses/module.py b/python/ray/dashboard/subprocesses/module.py
@@ -12,7 +12,7 @@
 import ray
 from ray import ray_constants
 from ray._raylet import GcsClient
-from ray._private.gcs_utils import GcsAioClient
+from ray._private.gcs_utils import GcsAioClient, GcsChannel
 from ray._private.ray_logging import configure_log_file
 from ray._private.utils import open_log
 from ray.dashboard.subprocesses.utils import (
@@ -68,6 +68,7 @@ def __init__(
         # Lazy init
         self._gcs_client = None
         self._gcs_aio_client = None
+        self._aiogrpc_gcs_channel = None
         self._parent_process_death_detection_task = None
         self._http_session = None
 
@@ -159,6 +160,14 @@ def gcs_client(self):
             self._gcs_client = ray.experimental.internal_kv.internal_kv_get_gcs_client()
         return self._gcs_client
 
+    @property
+    def aiogrpc_gcs_channel(self):
+        if self._aiogrpc_gcs_channel is None:
+            gcs_channel = GcsChannel(gcs_address=self._config.gcs_address, aio=True)
+            gcs_channel.connect()
+            self._aiogrpc_gcs_channel = gcs_channel.channel()
+        return self._aiogrpc_gcs_channel
+
     @property
     def session_name(self):
         """
diff --git a/python/ray/dashboard/subprocesses/tests/utils.py b/python/ray/dashboard/subprocesses/tests/utils.py
@@ -24,6 +24,10 @@ def gcs_aio_client(self):
     def gcs_client(self):
         return None
 
+    @property
+    def aiogrpc_gcs_channel(self):
+        return None
+
 
 class TestModule(BaseTestModule):
     def __init__(self, *args, **kwargs):
diff --git a/python/ray/tests/test_state_api_log.py b/python/ray/tests/test_state_api_log.py
@@ -999,9 +999,7 @@ def verify():
     with pytest.raises(requests.HTTPError) as e:
         list_logs(node_id=node_id)
 
-    assert (
-        f"Agent for node id: {node_id} doesn't exist." in e.value.response.json()["msg"]
-    )
+    assert e.value.response.status_code == 500
 
 
 @pytest.mark.skipif(