[train] train v1 export api (#51177)

matthewdeng · web-flow · commit 31878c97629a · 2025-03-13T17:32:54.000-07:00
This PR implements the export API for Ray Train V1 state. This builds on top of #50622, which implements the export API for Ray Train V2. ## Key Changes - Added `export.py` with conversion functions between Train V1 state models and Train (V2) state export protobuf - Updated `TrainRunInfo` and `TrainWorkerInfo` schemas with additional fields for compatibility: - Log file paths for controller and workers - Note that these point to the Ray worker stderr logs, rather than specific train logs. - Resource allocation information - Made worker status a required field - Note that it is always set as ACTIVE for now. Signed-off-by: Matthew Deng <matt@anyscale.com>
diff --git a/python/ray/train/BUILD b/python/ray/train/BUILD
@@ -745,6 +745,14 @@ py_test(
     ],
 )
 
+# Runs tests/test_state_export.py, the test file added for the Train state
+# export API.
+py_test(
+    name = "test_state_export",
+    size = "small",
+    srcs = ["tests/test_state_export.py"],
+    tags = ["team:ml", "exclusive"],
+    deps = [":train_lib", ":conftest"]
+)
+
 py_test(
     name = "test_tensorflow_checkpoint",
     size = "small",
diff --git a/python/ray/train/_internal/backend_executor.py b/python/ray/train/_internal/backend_executor.py
@@ -565,6 +565,9 @@ def initialize_session(
             from ray.train._internal.state.schema import RunStatusEnum
 
             core_context = ray.runtime_context.get_runtime_context()
+            controller_log_file_path = (
+                ray._private.worker.global_worker.get_err_file_path()
+            )
 
             self.state_manager.register_train_run(
                 run_id=self._trial_info.run_id,
@@ -575,6 +578,8 @@ def initialize_session(
                 worker_group=self.worker_group,
                 start_time_ms=self._start_time_ms,
                 run_status=RunStatusEnum.RUNNING,
+                controller_log_file_path=controller_log_file_path,
+                resources=[self._resources_per_worker] * self._num_workers,
             )
 
         # Run the training function asynchronously in its own thread.
diff --git a/python/ray/train/_internal/state/export.py b/python/ray/train/_internal/state/export.py
@@ -0,0 +1,104 @@
+from typing import Optional
+from ray.core.generated.export_train_state_pb2 import (
+    ExportTrainRunEventData as ProtoTrainRun,
+    ExportTrainRunAttemptEventData as ProtoTrainRunAttempt,
+)
+from ray.train._internal.state.schema import (
+    TrainRunInfo,
+    TrainWorkerInfo,
+    RunStatusEnum,
+    ActorStatusEnum,
+)
+
+
+# Value written to the ``schema_version`` field of both protobuf messages
+# built in this module.
+TRAIN_SCHEMA_VERSION = 1
+
+# Status mapping dictionaries
+# Worker actor status -> export proto ActorStatus.
+_ACTOR_STATUS_MAP = {
+    ActorStatusEnum.ALIVE: ProtoTrainRunAttempt.ActorStatus.ALIVE,
+    ActorStatusEnum.DEAD: ProtoTrainRunAttempt.ActorStatus.DEAD,
+}
+
+# Run status -> export proto RunAttemptStatus.
+# Note that STARTED is exported as PENDING; the other names map one-to-one.
+_RUN_ATTEMPT_STATUS_MAP = {
+    RunStatusEnum.STARTED: ProtoTrainRunAttempt.RunAttemptStatus.PENDING,
+    RunStatusEnum.RUNNING: ProtoTrainRunAttempt.RunAttemptStatus.RUNNING,
+    RunStatusEnum.FINISHED: ProtoTrainRunAttempt.RunAttemptStatus.FINISHED,
+    RunStatusEnum.ERRORED: ProtoTrainRunAttempt.RunAttemptStatus.ERRORED,
+    RunStatusEnum.ABORTED: ProtoTrainRunAttempt.RunAttemptStatus.ABORTED,
+}
+
+# Run status -> export proto RunStatus.
+# Note that STARTED is exported as INITIALIZING; the other names map one-to-one.
+_RUN_STATUS_MAP = {
+    RunStatusEnum.STARTED: ProtoTrainRun.RunStatus.INITIALIZING,
+    RunStatusEnum.RUNNING: ProtoTrainRun.RunStatus.RUNNING,
+    RunStatusEnum.FINISHED: ProtoTrainRun.RunStatus.FINISHED,
+    RunStatusEnum.ERRORED: ProtoTrainRun.RunStatus.ERRORED,
+    RunStatusEnum.ABORTED: ProtoTrainRun.RunStatus.ABORTED,
+}
+
+
+def _ms_to_ns(ms: Optional[int]) -> Optional[int]:
+    if ms is None:
+        return None
+    return ms * 1000000
+
+
+# Helper conversion functions
+def _to_proto_resources(resources: dict) -> ProtoTrainRunAttempt.TrainResources:
+    """Convert resources dictionary to protobuf TrainResources."""
+    return ProtoTrainRunAttempt.TrainResources(resources=resources)
+
+
+def _to_proto_worker(worker: TrainWorkerInfo) -> ProtoTrainRunAttempt.TrainWorker:
+    """Convert TrainWorker to protobuf format."""
+    proto_worker = ProtoTrainRunAttempt.TrainWorker(
+        world_rank=worker.world_rank,
+        local_rank=worker.local_rank,
+        node_rank=worker.node_rank,
+        actor_id=bytes.fromhex(worker.actor_id),
+        node_id=bytes.fromhex(worker.node_id),
+        node_ip=worker.node_ip,
+        pid=worker.pid,
+        gpu_ids=worker.gpu_ids,
+        status=_ACTOR_STATUS_MAP[worker.status],
+        resources=_to_proto_resources(worker.resources),
+        log_file_path=worker.worker_log_file_path,
+    )
+
+    return proto_worker
+
+
+# Main conversion functions
+def train_run_info_to_proto_run(run_info: TrainRunInfo) -> ProtoTrainRun:
+    """Convert TrainRunInfo to TrainRun protobuf format."""
+    proto_run = ProtoTrainRun(
+        schema_version=TRAIN_SCHEMA_VERSION,
+        id=run_info.id,
+        name=run_info.name,
+        job_id=bytes.fromhex(run_info.job_id),
+        controller_actor_id=bytes.fromhex(run_info.controller_actor_id),
+        status=_RUN_STATUS_MAP[run_info.run_status],
+        status_detail=run_info.status_detail,
+        start_time_ns=_ms_to_ns(run_info.start_time_ms),
+        end_time_ns=_ms_to_ns(run_info.end_time_ms),
+        controller_log_file_path=run_info.controller_log_file_path,
+    )
+
+    return proto_run
+
+
+def train_run_info_to_proto_attempt(run_info: TrainRunInfo) -> ProtoTrainRunAttempt:
+    """Convert TrainRunInfo to TrainRunAttempt protobuf format."""
+
+    proto_attempt = ProtoTrainRunAttempt(
+        schema_version=TRAIN_SCHEMA_VERSION,
+        run_id=run_info.id,
+        attempt_id=run_info.id,  # Same as run_id
+        status=_RUN_ATTEMPT_STATUS_MAP[run_info.run_status],
+        status_detail=run_info.status_detail,
+        start_time_ns=_ms_to_ns(run_info.start_time_ms),
+        end_time_ns=_ms_to_ns(run_info.end_time_ms),
+        resources=[_to_proto_resources(r) for r in run_info.resources],
+        workers=[_to_proto_worker(worker) for worker in run_info.workers],
+    )
+
+    return proto_attempt
diff --git a/python/ray/train/_internal/state/schema.py b/python/ray/train/_internal/state/schema.py
@@ -1,5 +1,5 @@
 from enum import Enum
-from typing import List, Optional
+from typing import Dict, List, Optional
 
 from ray._private.pydantic_compat import BaseModel, Field
 from ray.dashboard.modules.job.pydantic_models import JobDetails
@@ -47,9 +47,13 @@ class TrainWorkerInfo(BaseModel):
     gpu_ids: List[int] = Field(
         description="A list of GPU ids allocated to that worker."
     )
-    status: Optional[ActorStatusEnum] = Field(
+    status: ActorStatusEnum = Field(
         description="The status of the train worker actor. It can be ALIVE or DEAD."
     )
+    resources: Dict[str, float] = Field(
+        description="The resources allocated to the worker."
+    )
+    worker_log_file_path: str = Field(description="The path to the worker log file.")
 
 
 @DeveloperAPI
@@ -139,6 +143,12 @@ class TrainRunInfo(BaseModel):
         description="The UNIX timestamp of the end time of this Train run. "
         "If null, the Train run has not ended yet."
     )
+    controller_log_file_path: str = Field(
+        description="The path to the controller log file."
+    )
+    resources: List[Dict[str, float]] = Field(
+        description="The resources allocated to the worker."
+    )
 
 
 @DeveloperAPI
diff --git a/python/ray/train/_internal/state/state_actor.py b/python/ray/train/_internal/state/state_actor.py
@@ -1,8 +1,14 @@
 import logging
+import os
 import threading
 from typing import Dict, Optional
 
 import ray
+from ray._private.event.export_event_logger import (
+    EventLogType,
+    get_export_event_logger,
+    check_export_api_enabled,
+)
 from ray.actor import ActorHandle
 from ray.train._internal.state.schema import TrainRunInfo
 
@@ -14,10 +20,19 @@ class TrainStateActor:
     def __init__(self):
+        # Registered train runs, keyed by run id (populated by
+        # register_train_run).
         self._run_infos: Dict[str, TrainRunInfo] = {}
 
+        # Export logger (None when export is unavailable) plus one enable flag
+        # per event type: train runs and train run attempts.
+        (
+            self._export_logger,
+            self._is_train_run_export_api_enabled,
+            self._is_train_run_attempt_export_api_enabled,
+        ) = self._init_export_logger()
+
     def register_train_run(self, run_info: TrainRunInfo) -> None:
         # Register a new train run.
         self._run_infos[run_info.id] = run_info
 
+        # Emit export events for the run and for its attempt. Each helper
+        # returns immediately when its event type is not enabled.
+        self._maybe_export_train_run(run_info)
+        self._maybe_export_train_run_attempt(run_info)
+
     def get_train_run(self, run_id: str) -> Optional[TrainRunInfo]:
         # Retrieve a registered run with its id
         return self._run_infos.get(run_id, None)
@@ -26,6 +41,81 @@ def get_all_train_runs(self) -> Dict[str, TrainRunInfo]:
         # Retrieve all registered train runs
         return self._run_infos
 
+    # ============================
+    # Export API
+    # ============================
+
+    def is_export_api_enabled(self) -> bool:
+        return self._export_logger is not None
+
+    def _init_export_logger(self) -> tuple[Optional[logging.Logger], bool, bool]:
+        """Initialize the export logger and check if the export API is enabled.
+
+        Returns:
+            A tuple containing:
+                - The export logger (or None if export API is not enabled).
+                - A boolean indicating if the export API is enabled for train runs.
+                - A boolean indicating if the export API is enabled for train run attempts.
+        """
+        # Proto schemas should be imported within the scope of TrainStateActor to
+        # prevent serialization errors.
+        from ray.core.generated.export_event_pb2 import ExportEvent
+
+        is_train_run_export_api_enabled = check_export_api_enabled(
+            ExportEvent.SourceType.EXPORT_TRAIN_RUN
+        )
+        is_train_run_attempt_export_api_enabled = check_export_api_enabled(
+            ExportEvent.SourceType.EXPORT_TRAIN_RUN_ATTEMPT
+        )
+        export_api_enabled = (
+            is_train_run_export_api_enabled or is_train_run_attempt_export_api_enabled
+        )
+
+        if not export_api_enabled:
+            return None, False, False
+
+        log_directory = os.path.join(
+            ray._private.worker._global_node.get_session_dir_path(), "logs"
+        )
+        logger = None
+        try:
+            logger = get_export_event_logger(
+                EventLogType.TRAIN_STATE,
+                log_directory,
+            )
+        except Exception:
+            logger.exception(
+                "Unable to initialize the export event logger, so no Train export "
+                "events will be written."
+            )
+
+        if logger is None:
+            return None, False, False
+
+        return (
+            logger,
+            is_train_run_export_api_enabled,
+            is_train_run_attempt_export_api_enabled,
+        )
+
+    def _maybe_export_train_run(self, run_info: TrainRunInfo) -> None:
+        if not self._is_train_run_export_api_enabled:
+            return
+
+        from ray.train._internal.state.export import train_run_info_to_proto_run
+
+        run_proto = train_run_info_to_proto_run(run_info)
+        self._export_logger.send_event(run_proto)
+
+    def _maybe_export_train_run_attempt(self, run_info: TrainRunInfo) -> None:
+        if not self._is_train_run_attempt_export_api_enabled:
+            return
+
+        from ray.train._internal.state.export import train_run_info_to_proto_attempt
+
+        run_attempt_proto = train_run_info_to_proto_attempt(run_info)
+        self._export_logger.send_event(run_attempt_proto)
+
 
 TRAIN_STATE_ACTOR_NAME = "train_state_actor"
 TRAIN_STATE_ACTOR_NAMESPACE = "_train_state_actor"
diff --git a/python/ray/train/_internal/state/state_manager.py b/python/ray/train/_internal/state/state_manager.py
@@ -1,11 +1,12 @@
 import logging
 import os
 from collections import defaultdict
-from typing import Any, Dict
+from typing import Any, Dict, List
 
 import ray
 from ray.data import Dataset
 from ray.train._internal.state.schema import (
+    ActorStatusEnum,
     RunStatusEnum,
     TrainDatasetInfo,
     TrainRunInfo,
@@ -37,6 +38,8 @@ def register_train_run(
         datasets: Dict[str, Dataset],
         worker_group: WorkerGroup,
         start_time_ms: float,
+        controller_log_file_path: str,
+        resources: List[Dict[str, float]],
         status_detail: str = "",
     ) -> None:
         """Collect Train Run Info and report to StateActor."""
@@ -50,7 +53,7 @@ def register_train_run(
         def collect_train_worker_info():
             train_context = ray.train.get_context()
             core_context = ray.runtime_context.get_runtime_context()
-
+            worker_log_file_path = ray._private.worker.global_worker.get_err_file_path()
             return TrainWorkerInfo(
                 world_rank=train_context.get_world_rank(),
                 local_rank=train_context.get_local_rank(),
@@ -60,6 +63,9 @@ def collect_train_worker_info():
                 node_ip=ray.util.get_node_ip_address(),
                 gpu_ids=ray.get_gpu_ids(),
                 pid=os.getpid(),
+                resources=resources[0],
+                worker_log_file_path=worker_log_file_path,
+                status=ActorStatusEnum.ALIVE,
             )
 
         futures = [
@@ -97,6 +103,8 @@ def collect_train_worker_info():
             start_time_ms=start_time_ms,
             run_status=run_status,
             status_detail=status_detail,
+            controller_log_file_path=controller_log_file_path,
+            resources=resources,
         )
 
         # Clear the cached info to avoid registering the same run twice
diff --git a/python/ray/train/tests/test_state.py b/python/ray/train/tests/test_state.py
diff --git a/python/ray/train/tests/test_state_export.py b/python/ray/train/tests/test_state_export.py
diff --git a/python/ray/train/v2/_internal/state/export.py b/python/ray/train/v2/_internal/state/export.py