Skip to content

Commit 02be8a4

Browse files
feat(truss): Add Metrics watching to truss train (#1574)
* metrics start
* working
* TrainingPoller
* small updates
* small refactor
* precommit
* remove storage
* rename common func
* dynamic coloring
* offset minutes
* move files, offset minutes
* i think this is it
* add unit test with heavy mocking
* pr review
* test train core
* remove unnecessary codes
1 parent a4ed096 commit 02be8a4

File tree

9 files changed

+395
-80
lines changed

9 files changed

+395
-80
lines changed

truss/cli/cli.py

Lines changed: 30 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
from rich.console import Console
2222

2323
import truss
24+
import truss.cli.train.core as train_cli
2425
from truss.base.constants import (
2526
PRODUCTION_ENVIRONMENT_NAME,
2627
TRTLLM_MIN_MEMORY_REQUEST_GI,
@@ -32,12 +33,6 @@
3233
from truss.cli.logs import utils as cli_log_utils
3334
from truss.cli.logs.model_log_watcher import ModelDeploymentLogWatcher
3435
from truss.cli.logs.training_log_watcher import TrainingLogWatcher
35-
from truss.cli.train import (
36-
get_args_for_logs,
37-
get_args_for_stop,
38-
stop_all_jobs,
39-
view_training_details,
40-
)
4136
from truss.remote.baseten.core import (
4237
ACTIVE_STATUS,
4338
DEPLOYING_STATUSES,
@@ -925,7 +920,9 @@ def push_training_job(config: Path, remote: Optional[str], tail: bool):
925920
console.print("✨ Training job successfully created!", style="green")
926921
console.print(
927922
f"🪵 View logs for your job via "
928-
f"[cyan]`truss train logs --project-id {project_resp['id']} --job-id {job_resp['id']} [--tail]`[/cyan]"
923+
f"[cyan]`truss train logs --job-id {job_resp['id']} [--tail]`[/cyan]\n"
924+
f"🔍 View metrics for your job via "
925+
f"[cyan]`truss train metrics --job-id {job_resp['id']}`[/cyan]"
929926
)
930927

931928
if tail:
@@ -953,7 +950,9 @@ def get_job_logs(
953950
remote_provider: BasetenRemote = cast(
954951
BasetenRemote, RemoteFactory.create(remote=remote)
955952
)
956-
project_id, job_id = get_args_for_logs(console, remote_provider, project_id, job_id)
953+
project_id, job_id = train_cli.get_args_for_monitoring(
954+
console, remote_provider, project_id, job_id
955+
)
957956

958957
if not tail:
959958
logs = remote_provider.api.get_training_job_logs(project_id, job_id)
@@ -986,9 +985,9 @@ def stop_job(
986985
BasetenRemote, RemoteFactory.create(remote=remote)
987986
)
988987
if all:
989-
stop_all_jobs(console, remote_provider, project_id)
988+
train_cli.stop_all_jobs(console, remote_provider, project_id)
990989
else:
991-
project_id, job_id = get_args_for_stop(
990+
project_id, job_id = train_cli.get_args_for_stop(
992991
console, remote_provider, project_id, job_id
993992
)
994993
remote_provider.api.stop_training_job(project_id, job_id)
@@ -1016,7 +1015,27 @@ def view_training(
10161015
remote_provider: BasetenRemote = cast(
10171016
BasetenRemote, RemoteFactory.create(remote=remote)
10181017
)
1019-
view_training_details(console, remote_provider, project_id, job_id)
1018+
train_cli.view_training_details(console, remote_provider, project_id, job_id)
1019+
1020+
1021+
@train.command(name="metrics")
1022+
@click.option("--project-id", type=str, required=False, help="Project ID.")
1023+
@click.option("--job-id", type=str, required=False, help="Job ID.")
1024+
@click.option("--remote", type=str, required=False, help="Remote to use")
1025+
@log_level_option
1026+
@error_handling
1027+
def get_job_metrics(
1028+
project_id: Optional[str], job_id: Optional[str], remote: Optional[str]
1029+
):
1030+
"""Get metrics for a training job"""
1031+
1032+
if not remote:
1033+
remote = remote_cli.inquire_remote_name()
1034+
1035+
remote_provider: BasetenRemote = cast(
1036+
BasetenRemote, RemoteFactory.create(remote=remote)
1037+
)
1038+
train_cli.view_training_job_metrics(console, remote_provider, project_id, job_id)
10201039

10211040

10221041
# End Training Stuff #####################################################################

truss/cli/common.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
POLL_INTERVAL_SEC = 2

truss/cli/logs/base_watcher.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,12 @@
55

66
from rich import console as rich_console
77

8+
from truss.cli.common import POLL_INTERVAL_SEC
89
from truss.cli.logs.utils import ParsedLog, parse_logs
910
from truss.remote.baseten.api import BasetenApi
1011

1112
# NB(nikhil): This helps account for (1) log processing delays (2) clock skews
1213
CLOCK_SKEW_BUFFER_MS = 10000
13-
POLL_INTERVAL_SEC = 2
1414

1515

1616
class LogWatcher(ABC):
Lines changed: 7 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -1,30 +1,16 @@
11
import signal
2-
import time
32
from typing import Any, List, Optional
43

54
from rich import console as rich_console
65

7-
from truss.cli.logs.base_watcher import POLL_INTERVAL_SEC, LogWatcher
6+
from truss.cli.logs.base_watcher import LogWatcher
7+
from truss.cli.train.poller import TrainingPollerMixin
88
from truss.remote.baseten.api import BasetenApi
99

10-
# NB(nikhil): When a job ends, we poll for this many seconds after to capture
11-
# any trailing logs that contain information about errors.
12-
JOB_TERMINATION_GRACE_PERIOD_SEC = 10
1310

14-
JOB_STARTING_STATES = ["TRAINING_JOB_CREATED", "TRAINING_JOB_DEPLOYING"]
15-
JOB_RUNNING_STATES = ["TRAINING_JOB_RUNNING"]
16-
JOB_ENDED_STATES = [
17-
"TRAINING_JOB_COMPLETED",
18-
"TRAINING_JOB_FAILED",
19-
"TRAINING_JOB_STOPPED",
20-
]
21-
22-
23-
class TrainingLogWatcher(LogWatcher):
11+
class TrainingLogWatcher(TrainingPollerMixin, LogWatcher):
2412
project_id: str
2513
job_id: str
26-
_poll_stop_time: Optional[int] = None
27-
_current_status: Optional[str] = None
2814

2915
def __init__(
3016
self,
@@ -33,60 +19,19 @@ def __init__(
3319
job_id: str,
3420
console: rich_console.Console,
3521
):
36-
super().__init__(api, console)
37-
self.project_id = project_id
38-
self.job_id = job_id
39-
# register siging handler that instructs user on how to stop the job
22+
TrainingPollerMixin.__init__(self, api, project_id, job_id, console)
23+
LogWatcher.__init__(self, api, console)
24+
# registering the sigint allows us to provide messaging on next steps
4025
signal.signal(signal.SIGINT, self._handle_sigint)
4126

4227
def _handle_sigint(self, signum: int, frame: Any) -> None:
43-
msg = f"\n\nExiting training job logs. To stop the job, run `truss train stop --project-id {self.project_id} --job-id {self.job_id}`"
28+
msg = f"\n\nExiting training job logs. To stop the job, run `truss train stop --job-id {self.job_id}`"
4429
self.console.print(msg, style="yellow")
4530
raise KeyboardInterrupt()
4631

47-
def _get_current_job_status(self) -> str:
48-
job = self.api.get_training_job(self.project_id, self.job_id)
49-
return job["training_job"]["current_status"]
50-
51-
def before_polling(self) -> None:
52-
self._current_status = self._get_current_job_status()
53-
status_str = "Waiting for job to run, currently {current_status}..."
54-
with self.console.status(
55-
status_str.format(current_status=self._current_status), spinner="dots"
56-
) as status:
57-
while self._current_status in JOB_STARTING_STATES:
58-
time.sleep(POLL_INTERVAL_SEC)
59-
self._current_status = self._get_current_job_status()
60-
status.update(status_str.format(current_status=self._current_status))
61-
6232
def fetch_logs(
6333
self, start_epoch_millis: Optional[int], end_epoch_millis: Optional[int]
6434
) -> List[Any]:
6535
return self.api.get_training_job_logs(
6636
self.project_id, self.job_id, start_epoch_millis, end_epoch_millis
6737
)
68-
69-
def should_poll_again(self) -> bool:
70-
return self._current_status in JOB_RUNNING_STATES or self._poll_final_logs()
71-
72-
def post_poll(self) -> None:
73-
self._current_status = self._get_current_job_status()
74-
self._maybe_update_poll_stop_time(self._current_status)
75-
76-
def after_polling(self) -> None:
77-
if self._current_status == "TRAINING_JOB_COMPLETED":
78-
self.console.print("Training job completed successfully.", style="green")
79-
elif self._current_status == "TRAINING_JOB_FAILED":
80-
self.console.print("Training job failed.", style="red")
81-
elif self._current_status == "TRAINING_JOB_STOPPED":
82-
self.console.print("Training job stopped by user.", style="yellow")
83-
84-
def _poll_final_logs(self):
85-
if self._poll_stop_time is None:
86-
return False
87-
88-
return int(time.time()) <= self._poll_stop_time
89-
90-
def _maybe_update_poll_stop_time(self, current_status: str) -> None:
91-
if current_status not in JOB_RUNNING_STATES and self._poll_stop_time is None:
92-
self._poll_stop_time = int(time.time()) + JOB_TERMINATION_GRACE_PERIOD_SEC

truss/cli/train.py renamed to truss/cli/train/core.py

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from InquirerPy import inquirer
66
from rich.console import Console
77

8+
from truss.cli.train.metrics_watcher import MetricsWatcher
89
from truss.remote.baseten.remote import BasetenRemote
910

1011
ACTIVE_JOB_STATUSES = [
@@ -38,7 +39,7 @@ def get_args_for_stop(
3839
return project_id, job_id
3940

4041

41-
def get_args_for_logs(
42+
def get_args_for_monitoring(
4243
console: Console,
4344
remote_provider: BasetenRemote,
4445
project_id: Optional[str],
@@ -49,12 +50,12 @@ def get_args_for_logs(
4950
project_id=project_id, job_id=job_id
5051
)
5152
if not jobs:
52-
raise click.UsageError("Unable to get logs. No jobs found.")
53+
raise click.UsageError("No jobs found.")
5354
if len(jobs) > 1:
5455
sorted_jobs = sorted(jobs, key=lambda x: x["created_at"], reverse=True)
5556
job = sorted_jobs[0]
5657
console.print(
57-
f"Multiple jobs found. Showing logs for the most recently created job: {job['id']}",
58+
f"Multiple jobs found. Showing the most recently created job: {job['id']}",
5859
style="yellow",
5960
)
6061
else:
@@ -187,3 +188,19 @@ def stop_all_jobs(
187188
for job in active_jobs:
188189
remote_provider.api.stop_training_job(job["training_project"]["id"], job["id"])
189190
console.print("Training jobs stopped successfully.", style="green")
191+
192+
193+
def view_training_job_metrics(
194+
console: Console,
195+
remote_provider: BasetenRemote,
196+
project_id: Optional[str],
197+
job_id: Optional[str],
198+
):
199+
"""
200+
view_training_job_metrics shows a list of metrics for a training job.
201+
"""
202+
project_id, job_id = get_args_for_monitoring(
203+
console, remote_provider, project_id, job_id
204+
)
205+
metrics_display = MetricsWatcher(remote_provider.api, project_id, job_id, console)
206+
metrics_display.watch()

truss/cli/train/metrics_watcher.py

Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
import signal
2+
import time
3+
import traceback
4+
from typing import Any, Dict, List, Optional, Tuple
5+
6+
from rich.console import Console
7+
from rich.live import Live
8+
from rich.table import Table
9+
from rich.text import Text
10+
11+
from truss.cli.train.poller import TrainingPollerMixin
12+
from truss.remote.baseten.api import BasetenApi
13+
14+
METRICS_POLL_INTERVAL_SEC = 30
15+
16+
17+
class MetricsWatcher(TrainingPollerMixin):
18+
live: Optional[Live]
19+
20+
def __init__(self, api: BasetenApi, project_id: str, job_id: str, console: Console):
21+
super().__init__(api, project_id, job_id, console)
22+
23+
self.live = None
24+
signal.signal(signal.SIGINT, self._handle_sigint)
25+
26+
def _handle_sigint(self, signum: int, frame: Any) -> None:
27+
if self.live:
28+
self.live.stop()
29+
msg = f"\n\nExiting training job metrics. To stop the job, run `truss train stop --job-id {self.job_id}`"
30+
self.console.print(msg, style="yellow")
31+
raise KeyboardInterrupt()
32+
33+
def _format_bytes(self, bytes_val: float) -> Tuple[str, str]:
34+
"""Convert bytes to human readable format"""
35+
color_map = {"MB": "green", "GB": "cyan", "TB": "magenta"}
36+
unit = "MB"
37+
if bytes_val > 1024 * 1024 * 1024 * 1024:
38+
unit = "TB"
39+
elif bytes_val > 1024 * 1024 * 1024:
40+
unit = "GB"
41+
42+
if unit == "MB":
43+
return f"{bytes_val / (1024 * 1024):.2f} MB", color_map[unit]
44+
elif unit == "GB":
45+
return f"{bytes_val / (1024 * 1024 * 1024):.2f} GB", color_map[unit]
46+
return f"{bytes_val:.2f} bytes", color_map[unit]
47+
48+
def _get_latest_metric(self, metrics: List[Dict]) -> Optional[float]:
49+
"""Get the most recent metric value"""
50+
if not metrics:
51+
return None
52+
return metrics[-1].get("value")
53+
54+
def create_metrics_table(self, metrics_data: Dict) -> Table:
55+
"""Create a Rich table with the metrics"""
56+
table = Table(title="Training Job Metrics")
57+
table.add_column("Metric")
58+
table.add_column("Value")
59+
60+
# Add timestamp if available
61+
cpu_usage_data = metrics_data.get("cpu_usage", [])
62+
if cpu_usage_data and len(cpu_usage_data) > 0:
63+
latest_timestamp = cpu_usage_data[-1].get("timestamp")
64+
if latest_timestamp:
65+
table.add_row("Timestamp", latest_timestamp)
66+
table.add_section()
67+
68+
# CPU metrics
69+
cpu_usage = self._get_latest_metric(metrics_data.get("cpu_usage", []))
70+
if cpu_usage is not None:
71+
table.add_row("CPU Usage", f"{cpu_usage:.2f} cores")
72+
73+
cpu_memory = self._get_latest_metric(
74+
metrics_data.get("cpu_memory_usage_bytes", [])
75+
)
76+
if cpu_memory is not None:
77+
formatted_value, color = self._format_bytes(cpu_memory)
78+
table.add_row("CPU Memory", Text(formatted_value, style=color))
79+
80+
# Add separator after CPU metrics
81+
table.add_section()
82+
83+
# GPU metrics - grouped by GPU ID
84+
gpu_metrics = metrics_data.get("gpu_utilization", {})
85+
gpu_memory = metrics_data.get("gpu_memory_usage_bytes", {})
86+
87+
for gpu_id in sorted(set(gpu_metrics.keys()) | set(gpu_memory.keys())):
88+
# Add GPU utilization
89+
latest_util = self._get_latest_metric(gpu_metrics.get(gpu_id, []))
90+
if latest_util is not None:
91+
table.add_row(f"GPU {gpu_id} Usage", f"{latest_util * 100:.1f}%")
92+
93+
# Add GPU memory right after its utilization
94+
latest_memory = self._get_latest_metric(gpu_memory.get(gpu_id, []))
95+
if latest_memory is not None:
96+
formatted_value, color = self._format_bytes(latest_memory)
97+
table.add_row(
98+
f"GPU {gpu_id} Memory", Text(formatted_value, style=color)
99+
)
100+
101+
# Add separator after each GPU's metrics (except for the last one)
102+
if gpu_id != max(set(gpu_metrics.keys()) | set(gpu_memory.keys())):
103+
table.add_section()
104+
105+
# Add separator before storage metrics
106+
if gpu_metrics or gpu_memory:
107+
table.add_section()
108+
109+
return table
110+
111+
def watch(self, refresh_rate: int = METRICS_POLL_INTERVAL_SEC):
112+
"""Display continuously updating metrics"""
113+
self.before_polling()
114+
with Live(auto_refresh=False) as live:
115+
self.live = live
116+
while True:
117+
# our first instance of fetching metrics passes no explicit time range. We do this so that we can fetch metrics
118+
# for inactive jobs, using the job's completion time to set the time range.
119+
# Subsequent queries will fetch only the most recent data to avoid unnecessary load on VM
120+
metrics = self.api.get_training_job_metrics(
121+
self.project_id, self.job_id
122+
)
123+
try:
124+
# range of one minute since we only want the last recording
125+
table = self.create_metrics_table(metrics)
126+
live.update(table, refresh=True)
127+
if not self.should_poll_again():
128+
live.stop()
129+
break
130+
time.sleep(refresh_rate)
131+
end_epoch_millis = int(time.time() * 1000)
132+
start_epoch_millis = end_epoch_millis - 60 * 1000
133+
metrics = self.api.get_training_job_metrics(
134+
self.project_id,
135+
self.job_id,
136+
end_epoch_millis=end_epoch_millis,
137+
start_epoch_millis=start_epoch_millis,
138+
)
139+
self.post_poll()
140+
except Exception as e:
141+
live.stop()
142+
self.console.print(
143+
f"Error fetching metrics: {e}: {traceback.format_exc()}",
144+
style="red",
145+
)
146+
break
147+
self.after_polling()

0 commit comments

Comments (0)