Merged

Changes from 3 commits
36 changes: 28 additions & 8 deletions kubeflow/trainer/api/trainer_client.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 import logging
-from typing import Optional, Union
+from typing import Optional, Union, Iterator
 
 from kubeflow.trainer.constants import constants
 from kubeflow.trainer.types import types
@@ -120,8 +120,7 @@ def list_jobs(self, runtime: Optional[types.Runtime] = None) -> list[types.TrainJob]:
             runtime: Reference to one of the existing runtimes.
 
         Returns:
-            List: List of created TrainJobs.
-                If no TrainJob exist, an empty list is returned.
+            List of created TrainJobs. If no TrainJobs exist, an empty list is returned.
 
         Raises:
             TimeoutError: Timeout to list TrainJobs.
@@ -148,12 +147,33 @@ def get_job(self, name: str) -> types.TrainJob:
     def get_job_logs(
         self,
         name: str,
+        step: str = constants.NODE + "-0",
         follow: Optional[bool] = False,
-        step: str = constants.NODE,
-        node_rank: int = 0,
-    ) -> dict[str, str]:
-        """Get the logs from TrainJob"""
-        return self.backend.get_job_logs(name=name, follow=follow, step=step, node_rank=node_rank)
+    ) -> Iterator[str]:
+        """Get logs from a specific step of a TrainJob.
+
+        You can watch the logs in realtime as follows:
+        ```python
+        from kubeflow.trainer import TrainerClient
+
+        for logline in TrainerClient().get_job_logs(name="s8d44aa4fb6d", follow=True):
+            print(logline)
+        ```
+
+        Args:
+            name: Name of the TrainJob.
+            step: Step of the TrainJob to collect logs from, like dataset-initializer or node-0.
+            follow: Whether to stream logs in realtime as they are produced.
+
+        Returns:
+            Iterator of log lines.
+
+        Raises:
+            TimeoutError: Timeout to get a TrainJob.
+            RuntimeError: Failed to get a TrainJob.
+        """
+        return self.backend.get_job_logs(name=name, follow=follow, step=step)
 
     def wait_for_job_status(
         self,
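For short or already-finished jobs, the returned iterator can also be consumed eagerly instead of streamed. A minimal sketch, reusing the hypothetical job name from the docstring above; it mirrors the one-liner discussed in the review thread further down:

```python
from kubeflow.trainer import TrainerClient

# Join all log lines into a single string. Fine for small logs;
# for long-running jobs, prefer follow=True and iterate lazily.
print("\n".join(TrainerClient().get_job_logs(name="s8d44aa4fb6d")))
```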
7 changes: 3 additions & 4 deletions kubeflow/trainer/backends/base.py
@@ -14,7 +14,7 @@

 import abc
 
-from typing import Optional, Union
+from typing import Optional, Union, Iterator
 from kubeflow.trainer.constants import constants
 from kubeflow.trainer.types import types
 
@@ -47,9 +47,8 @@ def get_job_logs(
         self,
         name: str,
         follow: Optional[bool] = False,
-        step: str = constants.NODE,
-        node_rank: int = 0,
-    ) -> dict[str, str]:
+        step: str = constants.NODE + "-0",
+    ) -> Iterator[str]:
         raise NotImplementedError()
 
     def wait_for_job_status(
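Any concrete backend now satisfies this contract by yielding lines. A minimal, hypothetical in-memory backend sketch to illustrate the iterator contract (the class and its wiring are illustrative assumptions, not part of this PR):

```python
from typing import Iterator, Optional

from kubeflow.trainer.constants import constants


class InMemoryBackend:
    """Toy backend that replays pre-recorded logs, for illustration only."""

    def __init__(self, logs: dict[str, list[str]]):
        self.logs = logs

    def get_job_logs(
        self,
        name: str,
        follow: Optional[bool] = False,
        step: str = constants.NODE + "-0",
    ) -> Iterator[str]:
        # Yield one line at a time so callers can stream with a plain for-loop.
        yield from self.logs.get(name, [])
```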
94 changes: 25 additions & 69 deletions kubeflow/trainer/backends/kubernetes/backend.py
@@ -15,12 +15,12 @@
 import copy
 import logging
 import multiprocessing
-import queue
 import random
 import string
 import time
 import uuid
-from typing import Optional, Union
+from typing import Optional, Union, Iterator
+import re
 
 from kubeflow.trainer.constants import constants
 from kubeflow.trainer.types import types
@@ -173,7 +173,7 @@ def print_packages():
         )
 
         self.wait_for_job_status(job_name)
-        print(self.get_job_logs(job_name)["node-0"])
+        print(self.get_job_logs(job_name))
Contributor:
This will print a generator object, not logs, right?
I think we should instead do:

for line in self.get_job_logs(job_name):
    print(line, end="")

Contributor:
We also need to update other references, for example the README.

Contributor:
Would print(*self.get_job_logs(job_name), sep="\n") work?

Otherwise, would there be a way to override the string representation of the returned iterator?

Contributor:
Hmm, it could, but I think we might run out of memory :) And there would be no streaming, because we have to wait for the iterator to finish, which works against the purpose of an iterator.

Why not just use this?

for line in self.get_job_logs(job_name):
    print(line, end="")

Contributor:
I agree, and I prefer the streaming approach; it was really to get a one-liner :)

Member Author:
> Hmm, it could, but I think we might run out of memory :)

Why would we run out of memory? Since we just print the pip list + nvidia-smi output, the log would be small.

Users can do something like this if they don't want to write a loop:

print("\n".join(TrainerClient().get_job_logs(name=job_id)))

Contributor:
> Why would we run out of memory? Since we just print the pip list + nvidia-smi output, the log would be small.

Yeah, I meant when logs are large.

Contributor:
Alternatively, we could have two APIs -- one for streaming and another for returning a complete string, similar to what Ray does.

@andreyvelich @astefanutti any thoughts on that?

Member Author:
I would suggest that we consolidate this in the single get_job_logs() API, similar to how we consolidated BuiltinTrainer and CustomTrainer into the train() API. I don't see much value in separating them, since it is better to return Iterator[str] for both.

Contributor:
SGTM

         self.delete_job(job_name)
 
     def train(
@@ -328,92 +328,48 @@ def get_job_logs(
         self,
         name: str,
         follow: Optional[bool] = False,
-        step: str = constants.NODE,
-        node_rank: int = 0,
-    ) -> dict[str, str]:
-        """Get the logs from TrainJob"""
-
+        step: str = constants.NODE + "-0",
+    ) -> Iterator[str]:
+        """Get the TrainJob logs"""
Contributor:
Do we need this docstring here?

Member Author:
Not necessary, but it is a useful reminder of how this API is used, for developers and AI tools 🙂

Contributor:
I see, sounds good to me!
         # Get the TrainJob Pod name.
         pod_name = None
         for c in self.get_job(name).steps:
-            if c.status != constants.POD_PENDING:
-                if c.name == step or c.name == f"{step}-{node_rank}":
-                    pod_name = c.pod_name
-                    break
+            if c.status != constants.POD_PENDING and c.name == step:
+                pod_name = c.pod_name
+                break
         if pod_name is None:
-            return {}
-
-        # Dict where key is the Pod type and value is the Pod logs.
-        logs_dict = {}
-
-        # TODO (andreyvelich): Potentially, refactor this.
-        # Support logging of multiple Pods.
-        # TODO (andreyvelich): Currently, follow is supported only for node container.
-        if follow and step == constants.NODE:
-            log_streams = []
-            log_streams.append(
-                watch.Watch().stream(
+            return iter([])
Contributor:
Shall we raise RuntimeError or log a warning in this case?

Member Author:
Not sure we should raise an exception here, since at this stage the TrainJob has not yet produced logs, so we just return empty logs.
@astefanutti thoughts?

Contributor:
I see, makes sense to me to return an empty iterator then.

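One subtlety worth noting about the line above: because the rewritten method body contains yield, the whole function is a generator, so the value of return iter([]) is discarded during normal iteration; the call still correctly produces an empty iterator, and a bare return would behave identically. A self-contained sketch of that Python behavior:

```python
def gen():
    missing = True
    if missing:
        return iter([])  # the returned value is ignored; the generator just stops
    yield "never reached"

assert list(gen()) == []  # empty iterator, as intended
```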

+        try:
+            if follow:
+                log_stream = watch.Watch().stream(
                     self.core_api.read_namespaced_pod_log,
                     name=pod_name,
                     namespace=self.namespace,
-                    container=constants.NODE,
+                    container=re.sub(r"-\d+$", "", step),  # Remove the number for the node step.
Contributor:
nit: we call this twice; we could just compute it once and reuse it.
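A possible shape for that refactor, as a sketch only: compute the container name once before the follow branch and reuse it in both read paths.

```python
import re

step = "node-0"  # example step name; initializer steps are left unchanged
container = re.sub(r"-\d+$", "", step)  # -> "node", computed once and reused
```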

                     follow=True,
                 )
-            )
-            finished = [False] * len(log_streams)
-
-            # Create thread and queue per stream, for non-blocking iteration.
-            log_queue_pool = utils.get_log_queue_pool(log_streams)
-
-            # Iterate over every watching pods' log queue
-            while True:
-                for index, log_queue in enumerate(log_queue_pool):
-                    if all(finished):
+                # Stream logs incrementally
+                for logline in log_stream:
Contributor:
Are we sure each item is an entire line?

Member Author:
Yes, log_stream is a <generator object Watch.stream at 0x1073c6340> object, and we can return it line by line.

Contributor (@astefanutti, Sep 3, 2025):
I've checked, and it does indeed yield items line by line: https://github.com/kubernetes-client/python/blob/6e7c539f52dec4e993d2c32a4408920d8522f47e/kubernetes/base/watch/watch.py#L54-L83

I wasn't sure whether we had to do it ourselves or not.
+                    if logline is None:
Contributor:
Can this actually yield None?

Member Author:
I think we can just do this:

  if logline is None:
      return

Contributor:
Hmm, IIUC it never yields None: https://github.com/kubernetes-client/python/blob/master/kubernetes/base/watch/watch.py#L213-L216

We could just do:

for logline in log_stream:
    yield logline
+                        break
-                        break
-                    if finished[index]:
-                        continue
-                    # grouping the every 50 log lines of the same pod.
-                    for _ in range(50):
-                        try:
-                            logline = log_queue.get(timeout=1)
-                            if logline is None:
-                                finished[index] = True
-                                break
-                            # Print logs to the StdOut and update results dict.
-                            print(f"[{step}-{node_rank}]: {logline}")
-                            logs_dict[f"{step}-{node_rank}"] = (
-                                logs_dict.get(f"{step}-{node_rank}", "") + logline + "\n"
-                            )
-                        except queue.Empty:
-                            break
-                if all(finished):
-                    return logs_dict
-
-        try:
-            if step == constants.DATASET_INITIALIZER:
-                logs_dict[constants.DATASET_INITIALIZER] = self.core_api.read_namespaced_pod_log(
-                    name=pod_name,
-                    namespace=self.namespace,
-                    container=constants.DATASET_INITIALIZER,
-                )
-            elif step == constants.MODEL_INITIALIZER:
-                logs_dict[constants.MODEL_INITIALIZER] = self.core_api.read_namespaced_pod_log(
-                    name=pod_name,
-                    namespace=self.namespace,
-                    container=constants.MODEL_INITIALIZER,
-                )
+                    yield logline  # type:ignore
             else:
-                logs_dict[f"{step}-{node_rank}"] = self.core_api.read_namespaced_pod_log(
+                logs = self.core_api.read_namespaced_pod_log(
                     name=pod_name,
                     namespace=self.namespace,
-                    container=constants.NODE,
+                    container=re.sub(r"-\d+$", "", step),  # Remove the number for the node step.
                 )
+
+                for line in logs.splitlines():
+                    yield line
+
         except Exception as e:
             raise RuntimeError(
                 f"Failed to read logs for the pod {self.namespace}/{pod_name}"
             ) from e
-
-        return logs_dict

     def wait_for_job_status(
         self,
         name: str,
12 changes: 6 additions & 6 deletions kubeflow/trainer/backends/kubernetes/backend_test.py
@@ -917,14 +917,12 @@ def test_list_jobs(trainer_client, test_case):
name="valid flow with all defaults",
expected_status=SUCCESS,
config={"name": BASIC_TRAIN_JOB_NAME},
expected_output={
"node-0": "test log content",
},
expected_output=["test log content"],
),
TestCase(
name="runtime error when getting logs",
expected_status=FAILED,
config={"name": RUNTIME},
config={"name": BASIC_TRAIN_JOB_NAME, "namespace": FAIL_LOGS},
expected_error=RuntimeError,
),
],
@@ -933,10 +931,12 @@ def test_get_job_logs(trainer_client, test_case):
"""Test TrainerClient.get_job_logs with basic success path."""
print("Executing test:", test_case.name)
try:
trainer_client.namespace = test_case.config.get("namespace", DEFAULT_NAMESPACE)
Contributor:
This should be from backend, right?

Suggested change:
-        trainer_client.namespace = test_case.config.get("namespace", DEFAULT_NAMESPACE)
+        trainer_client.backend.namespace = test_case.config.get("namespace", DEFAULT_NAMESPACE)

Member Author:
Actually, trainer_client here is the Kubernetes backend, not TrainerClient():

yield KubernetesBackend(KubernetesBackendConfig())

Let me rename it.

Contributor:
Ahh I see, thank you!

         logs = trainer_client.get_job_logs(test_case.config.get("name"))
+        # Convert iterator to list for comparison.
+        logs_list = list(logs)
         assert test_case.expected_status == SUCCESS
-        assert logs == test_case.expected_output
-
+        assert logs_list == test_case.expected_output
     except Exception as e:
         assert type(e) is test_case.expected_error
     print("test execution complete")
21 changes: 0 additions & 21 deletions kubeflow/trainer/utils/utils.py
@@ -14,9 +14,7 @@

 import inspect
 import os
-import queue
 import textwrap
-import threading
 from typing import Any, Callable, Optional
 from urllib.parse import urlparse
 
@@ -571,22 +569,3 @@ def get_model_initializer(
     )
 
     return model_initializer
-
-
-def wrap_log_stream(q: queue.Queue, log_stream: Any):
-    while True:
-        try:
-            logline = next(log_stream)
-            q.put(logline)
-        except StopIteration:
-            q.put(None)
-            return
-
-
-def get_log_queue_pool(log_streams: list[Any]) -> list[queue.Queue]:
-    pool = []
-    for log_stream in log_streams:
-        q = queue.Queue(maxsize=100)
-        pool.append(q)
-        threading.Thread(target=wrap_log_stream, args=(q, log_stream)).start()
-    return pool
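For context on this removal: these helpers fanned each watch stream into a bounded queue on its own thread so several pods could be polled without blocking. With get_job_logs now yielding from a single stream, none of that machinery is needed. A simplified sketch of the new flow, using a hypothetical stand-in for the stream object:

```python
from typing import Any, Iterator


def stream_logs(log_stream: Any) -> Iterator[str]:
    # A plain generator over one line-oriented stream:
    # no threads, queues, or per-pod bookkeeping.
    yield from log_stream


# Each line reaches the caller as soon as the stream produces it.
for line in stream_logs(iter(["line 1", "line 2"])):
    print(line)
```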