From 3524abcfcbae621585fe39182a9a1ad145ce0601 Mon Sep 17 00:00:00 2001 From: Saad Zaher Date: Sat, 21 Jun 2025 15:05:28 +0100 Subject: [PATCH 01/33] Implement TrainerClient Backends & Local Process Signed-off-by: Saad Zaher --- python/kubeflow/trainer/api/trainer_client.py | 604 ++---------------- python/kubeflow/trainer/backends/__init__.py | 28 + python/kubeflow/trainer/backends/base.py | 62 ++ python/kubeflow/trainer/backends/k8s.py | 572 +++++++++++++++++ .../trainer/backends/local_process.py | 190 ++++++ .../kubeflow/trainer/constants/constants.py | 14 + python/kubeflow/trainer/local/__init__.py | 0 python/kubeflow/trainer/local/job.py | 95 +++ python/kubeflow/trainer/local/runtimes.py | 31 + python/kubeflow/trainer/types/backends.py | 34 + python/kubeflow/trainer/types/local.py | 54 ++ python/kubeflow/trainer/types/types.py | 2 +- python/kubeflow/trainer/utils/local.py | 163 +++++ 13 files changed, 1290 insertions(+), 559 deletions(-) create mode 100644 python/kubeflow/trainer/backends/__init__.py create mode 100644 python/kubeflow/trainer/backends/base.py create mode 100644 python/kubeflow/trainer/backends/k8s.py create mode 100644 python/kubeflow/trainer/backends/local_process.py create mode 100644 python/kubeflow/trainer/local/__init__.py create mode 100644 python/kubeflow/trainer/local/job.py create mode 100644 python/kubeflow/trainer/local/runtimes.py create mode 100644 python/kubeflow/trainer/types/backends.py create mode 100644 python/kubeflow/trainer/types/local.py create mode 100644 python/kubeflow/trainer/utils/local.py diff --git a/python/kubeflow/trainer/api/trainer_client.py b/python/kubeflow/trainer/api/trainer_client.py index 33bd78749..62347bc83 100644 --- a/python/kubeflow/trainer/api/trainer_client.py +++ b/python/kubeflow/trainer/api/trainer_client.py @@ -1,4 +1,4 @@ -# Copyright 2024 The Kubeflow Authors. +# Copyright 2024-2025 The Kubeflow Authors. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,585 +13,73 @@ # limitations under the License. import logging -import multiprocessing -import queue import random import string import uuid -from typing import Dict, List, Optional +from typing import Optional -from kubeflow_trainer_api import models from kubeflow.trainer.constants import constants from kubeflow.trainer.types import types -from kubeflow.trainer.utils import utils -from kubernetes import client, config, watch +from kubeflow.trainer.types.backends import BackendConfig +from kubeflow.trainer.backends import TRAINER_BACKEND_REGISTRY logger = logging.getLogger(__name__) -class TrainerClient: - def __init__( - self, - config_file: Optional[str] = None, - context: Optional[str] = None, - client_configuration: Optional[client.Configuration] = None, - namespace: Optional[str] = None, - ): - """TrainerClient constructor. Configure logging in your application - as follows to see detailed information from the TrainerClient APIs: - .. code-block:: python - import logging - logging.basicConfig() - log = logging.getLogger("kubeflow.trainer.api.trainer_client") - log.setLevel(logging.DEBUG) +class TrainerClient(object): - Args: - config_file: Path to the kube-config file. Defaults to ~/.kube/config. - context: Set the active context. Defaults to current_context from the kube-config. - client_configuration: Client configuration for cluster authentication. - You have to provide valid configuration with Bearer token or - with username and password. You can find an example here: - https://github.com/kubernetes-client/python/blob/67f9c7a97081b4526470cad53576bc3b71fa6fcc/examples/remote_cluster.py#L31 - namespace: Target Kubernetes namespace. If SDK runs outside of Kubernetes cluster it - takes the namespace from the kube-config context. 
If SDK runs inside - the Kubernetes cluster it takes namespace from the - `/var/run/secrets/kubernetes.io/serviceaccount/namespace` file. By default it - uses the `default` namespace. + def __init__(self, backend_type: Optional[str] = "kubernetes", backend_config: Optional[BackendConfig] = None): """ - - if namespace is None: - namespace = utils.get_default_target_namespace(context) - - # If client configuration is not set, use kube-config to access Kubernetes APIs. - if client_configuration is None: - # Load kube-config or in-cluster config. - if config_file or not utils.is_running_in_k8s(): - config.load_kube_config(config_file=config_file, context=context) - else: - config.load_incluster_config() - - k8s_client = client.ApiClient(client_configuration) - self.custom_api = client.CustomObjectsApi(k8s_client) - self.core_api = client.CoreV1Api(k8s_client) - - self.namespace = namespace - - def list_runtimes(self) -> List[types.Runtime]: - """List of the available Runtimes. - - Returns: - List[Runtime]: List of available training runtimes. - If no runtimes exist, an empty list is returned. - - Raises: - TimeoutError: Timeout to list Runtimes. - RuntimeError: Failed to list Runtimes. 
- """ - - result = [] - try: - thread = self.custom_api.list_cluster_custom_object( - constants.GROUP, - constants.VERSION, - constants.CLUSTER_TRAINING_RUNTIME_PLURAL, - async_req=True, - ) - - runtime_list = models.TrainerV1alpha1ClusterTrainingRuntimeList.from_dict( - thread.get(constants.DEFAULT_TIMEOUT) - ) - - if not runtime_list: - return result - - for runtime in runtime_list.items: - result.append(self.__get_runtime_from_crd(runtime)) - - except multiprocessing.TimeoutError: - raise TimeoutError( - f"Timeout to list {constants.CLUSTER_TRAINING_RUNTIME_KIND}s " - f"in namespace: {self.namespace}" - ) - except Exception: - raise RuntimeError( - f"Failed to list {constants.CLUSTER_TRAINING_RUNTIME_KIND}s " - f"in namespace: {self.namespace}" - ) - - return result - - def get_runtime(self, name: str) -> types.Runtime: - """Get the the Runtime object""" - - try: - thread = self.custom_api.get_cluster_custom_object( - constants.GROUP, - constants.VERSION, - constants.CLUSTER_TRAINING_RUNTIME_PLURAL, - name, - async_req=True, - ) - - runtime = models.TrainerV1alpha1ClusterTrainingRuntime.from_dict( - thread.get(constants.DEFAULT_TIMEOUT) # type: ignore - ) - - except multiprocessing.TimeoutError: - raise TimeoutError( - f"Timeout to get {constants.CLUSTER_TRAINING_RUNTIME_PLURAL}: " - f"{self.namespace}/{name}" - ) - except Exception: - raise RuntimeError( - f"Failed to get {constants.CLUSTER_TRAINING_RUNTIME_PLURAL}: " - f"{self.namespace}/{name}" - ) - - return self.__get_runtime_from_crd(runtime) # type: ignore - - def train( - self, - runtime: types.Runtime = types.DEFAULT_RUNTIME, - initializer: Optional[types.Initializer] = None, - trainer: Optional[types.CustomTrainer] = None, - ) -> str: + Initialize a trainer client. + backend_type: name of the backend to be used. default is kubernetes. + backend_config: backend configuration. default is None. + returns: None """ - Create the TrainJob. 
You can configure these types of training task: + backend = self.__init_backend(backend_type, backend_config) + self.__backend = backend - - Custom Training Task: Training with a self-contained function that encapsulates - the entire model training process, e.g. `CustomTrainer`. + def __init_backend(self, backendtype: str, backendconfig: BackendConfig): + backend = TRAINER_BACKEND_REGISTRY.get(backendtype.lower()) + if not backend: + raise ValueError("Unknown backend type '{}'".format(backendtype)) + # load the backend class + backend_cls = backend.get("backend_cls") + # check if backend configuration is present + if not backendconfig: + backendconfig = backend.get("config_cls")() + # initialize the backend class with the user provided config + return backend_cls(cfg=backendconfig) - Args: - runtime (`types.Runtime`): Reference to one of existing Runtimes. - initializer (`Optional[types.Initializer]`): - Configuration for the dataset and model initializers. - trainer (`Optional[types.CustomTrainer]`): - Configuration for Custom Training Task. - - Returns: - str: The unique name of the TrainJob that has been generated. - - Raises: - ValueError: Input arguments are invalid. - TimeoutError: Timeout to create TrainJobs. - RuntimeError: Failed to create TrainJobs. - """ - - # Generate unique name for the TrainJob. - # TODO (andreyvelich): Discuss this TrainJob name generation. - train_job_name = random.choice(string.ascii_lowercase) + uuid.uuid4().hex[:11] - - # Build the Trainer. - trainer_crd = models.TrainerV1alpha1Trainer() - - if trainer: - # If users choose to use a custom training function. - if isinstance(trainer, types.CustomTrainer): - trainer_crd = utils.get_trainer_crd_from_custom_trainer( - trainer, runtime - ) - - # If users choose to use a builtin trainer for post-training. 
- elif isinstance(trainer, types.BuiltinTrainer): - trainer_crd = utils.get_trainer_crd_from_builtin_trainer( - trainer, initializer - ) - - else: - raise ValueError( - f"The trainer type {type(trainer)} is not supported. " - "Please use CustomTrainer or BuiltinTrainer." - ) - - train_job = models.TrainerV1alpha1TrainJob( - apiVersion=constants.API_VERSION, - kind=constants.TRAINJOB_KIND, - metadata=models.IoK8sApimachineryPkgApisMetaV1ObjectMeta( - name=train_job_name - ), - spec=models.TrainerV1alpha1TrainJobSpec( - runtimeRef=models.TrainerV1alpha1RuntimeRef(name=runtime.name), - trainer=( - trainer_crd - if trainer_crd != models.TrainerV1alpha1Trainer() - else None - ), - initializer=( - models.TrainerV1alpha1Initializer( - dataset=utils.get_dataset_initializer(initializer.dataset), - model=utils.get_model_initializer(initializer.model), - ) - if isinstance(initializer, types.Initializer) - else None - ), - ), - ) - - # Create the TrainJob. - try: - self.custom_api.create_namespaced_custom_object( - constants.GROUP, - constants.VERSION, - self.namespace, - constants.TRAINJOB_PLURAL, - train_job.to_dict(), - ) - except multiprocessing.TimeoutError: - raise TimeoutError( - f"Timeout to create {constants.TRAINJOB_KIND}: {self.namespace}/{train_job_name}" - ) - except Exception: - raise RuntimeError( - f"Failed to create {constants.TRAINJOB_KIND}: {self.namespace}/{train_job_name}" - ) - - logger.debug( - f"{constants.TRAINJOB_KIND} {self.namespace}/{train_job_name} has been created" - ) - - return train_job_name - - def list_jobs( - self, runtime: Optional[types.Runtime] = None - ) -> List[types.TrainJob]: - """List of all TrainJobs. - - Returns: - List[TrainerV1alpha1TrainJob]: List of created TrainJobs. - If no TrainJob exist, an empty list is returned. - - Raises: - TimeoutError: Timeout to list TrainJobs. - RuntimeError: Failed to list TrainJobs. 
- """ + def list_runtimes(self): + return self.__backend.list_runtimes() - result = [] - try: - thread = self.custom_api.list_namespaced_custom_object( - constants.GROUP, - constants.VERSION, - self.namespace, - constants.TRAINJOB_PLURAL, - async_req=True, - ) + def get_runtime(self, name: str): + return self.__backend.get_runtime(name=name) - trainjob_list = models.TrainerV1alpha1TrainJobList.from_dict( - thread.get(constants.DEFAULT_TIMEOUT) - ) + def list_jobs(self, runtime: Optional[types.Runtime] = None): + return self.__backend.list_jobs(runtime=runtime) - if not trainjob_list: - return result - - for trainjob in trainjob_list.items: - # If runtime object is set, we check the TrainJob's runtime reference. - if ( - runtime is not None - and trainjob.spec - and trainjob.spec.runtime_ref - and trainjob.spec.runtime_ref.name != runtime.name - ): - continue - - result.append(self.__get_trainjob_from_crd(trainjob)) - - except multiprocessing.TimeoutError: - raise TimeoutError( - f"Timeout to list {constants.TRAINJOB_KIND}s in namespace: {self.namespace}" - ) - except Exception: - raise RuntimeError( - f"Failed to list {constants.TRAINJOB_KIND}s in namespace: {self.namespace}" - ) - - return result - - def get_job(self, name: str) -> types.TrainJob: - """Get the TrainJob object""" - - try: - thread = self.custom_api.get_namespaced_custom_object( - constants.GROUP, - constants.VERSION, - self.namespace, - constants.TRAINJOB_PLURAL, - name, - async_req=True, - ) - - trainjob = models.TrainerV1alpha1TrainJob.from_dict( - thread.get(constants.DEFAULT_TIMEOUT) # type: ignore - ) - - except multiprocessing.TimeoutError: - raise TimeoutError( - f"Timeout to get {constants.TRAINJOB_KIND}: {self.namespace}/{name}" - ) - except Exception: - raise RuntimeError( - f"Failed to get {constants.TRAINJOB_KIND}: {self.namespace}/{name}" - ) - - return self.__get_trainjob_from_crd(trainjob) # type: ignore - - def get_job_logs( - self, - name: str, - follow: Optional[bool] = False, - 
step: str = constants.NODE, - node_rank: int = 0, - ) -> Dict[str, str]: - """Get the logs from TrainJob""" - - # Get the TrainJob Pod name. - pod_name = None - for c in self.get_job(name).steps: - if c.status != constants.POD_PENDING: - if c.name == step or c.name == f"{step}-{node_rank}": - pod_name = c.pod_name - if pod_name is None: - return {} - - # Dict where key is the Pod type and value is the Pod logs. - logs_dict = {} - - # TODO (andreyvelich): Potentially, refactor this. - # Support logging of multiple Pods. - # TODO (andreyvelich): Currently, follow is supported only for node container. - if follow and step == constants.NODE: - log_streams = [] - log_streams.append( - watch.Watch().stream( - self.core_api.read_namespaced_pod_log, - name=pod_name, - namespace=self.namespace, - container=constants.NODE, - ) - ) - finished = [False] * len(log_streams) - - # Create thread and queue per stream, for non-blocking iteration. - log_queue_pool = utils.get_log_queue_pool(log_streams) - - # Iterate over every watching pods' log queue - while True: - for index, log_queue in enumerate(log_queue_pool): - if all(finished): - break - if finished[index]: - continue - # grouping the every 50 log lines of the same pod. - for _ in range(50): - try: - logline = log_queue.get(timeout=1) - if logline is None: - finished[index] = True - break - # Print logs to the StdOut and update results dict. 
- print(f"[{step}-{node_rank}]: {logline}") - logs_dict[f"{step}-{node_rank}"] = ( - logs_dict.get(f"{step}-{node_rank}", "") - + logline - + "\n" - ) - except queue.Empty: - break - if all(finished): - return logs_dict - - try: - if step == constants.DATASET_INITIALIZER: - logs_dict[constants.DATASET_INITIALIZER] = ( - self.core_api.read_namespaced_pod_log( - name=pod_name, - namespace=self.namespace, - container=constants.DATASET_INITIALIZER, - ) - ) - elif step == constants.MODEL_INITIALIZER: - logs_dict[constants.MODEL_INITIALIZER] = ( - self.core_api.read_namespaced_pod_log( - name=pod_name, - namespace=self.namespace, - container=constants.MODEL_INITIALIZER, - ) - ) - else: - logs_dict[f"{step}-{node_rank}"] = ( - self.core_api.read_namespaced_pod_log( - name=pod_name, - namespace=self.namespace, - container=constants.NODE, - ) - ) - - except Exception: - raise RuntimeError( - f"Failed to read logs for the pod {self.namespace}/{pod_name}" - ) - - return logs_dict + def get_job(self, name: str): + return self.__backend.get_job(name=name) def delete_job(self, name: str): - """Delete the TrainJob. - - Args: - name: Name of the TrainJob. + return self.__backend.delete_job(name=name) - Raises: - TimeoutError: Timeout to delete TrainJob. - RuntimeError: Failed to delete TrainJob. 
- """ - - try: - self.custom_api.delete_namespaced_custom_object( - constants.GROUP, - constants.VERSION, - self.namespace, - constants.TRAINJOB_PLURAL, - name=name, - ) - except multiprocessing.TimeoutError: - raise TimeoutError( - f"Timeout to delete {constants.TRAINJOB_KIND}: {self.namespace}/{name}" - ) - except Exception: - raise RuntimeError( - f"Failed to delete {constants.TRAINJOB_KIND}: {self.namespace}/{name}" - ) - - logger.debug( - f"{constants.TRAINJOB_KIND} {self.namespace}/{name} has been deleted" - ) - - def __get_runtime_from_crd( - self, - runtime_crd: models.TrainerV1alpha1ClusterTrainingRuntime, - ) -> types.Runtime: - - if not ( - runtime_crd.metadata - and runtime_crd.metadata.name - and runtime_crd.spec - and runtime_crd.spec.ml_policy - and runtime_crd.spec.template.spec - and runtime_crd.spec.template.spec.replicated_jobs + def get_job_logs(self, + name: str, + follow: Optional[bool] = False, + step: str = constants.NODE, + node_rank: int = 0, ): - raise Exception(f"ClusterTrainingRuntime CRD is invalid: {runtime_crd}") + return self.__backend.get_job_logs(name=name, follow=follow, step=step, node_rank=node_rank) - return types.Runtime( - name=runtime_crd.metadata.name, - trainer=utils.get_runtime_trainer( - runtime_crd.spec.template.spec.replicated_jobs, - runtime_crd.spec.ml_policy, - runtime_crd.metadata, - ), - ) - - def __get_trainjob_from_crd( - self, - trainjob_crd: models.TrainerV1alpha1TrainJob, - ) -> types.TrainJob: - - if not ( - trainjob_crd.metadata - and trainjob_crd.metadata.name - and trainjob_crd.metadata.namespace - and trainjob_crd.spec - and trainjob_crd.metadata.creation_timestamp + def train(self, + runtime: types.Runtime = types.DEFAULT_RUNTIME, + initializer: Optional[types.Initializer] = None, + trainer: Optional[types.CustomTrainer] = None, ): - raise Exception(f"TrainJob CRD is invalid: {trainjob_crd}") - - name = trainjob_crd.metadata.name - namespace = trainjob_crd.metadata.namespace - - # Construct the 
TrainJob from the CRD. - trainjob = types.TrainJob( - name=name, - creation_timestamp=trainjob_crd.metadata.creation_timestamp, - runtime=self.get_runtime(trainjob_crd.spec.runtime_ref.name), - steps=[], - ) - - # Add the TrainJob status. - # TODO (andreyvelich): Discuss how we should show TrainJob status to SDK users. - # The TrainJob exists at that stage so its status can safely default to Created - trainjob.status = constants.TRAINJOB_CREATED - # Then it can be read from the TrainJob conditions if any - if trainjob_crd.status and trainjob_crd.status.conditions: - for c in trainjob_crd.status.conditions: - if c.type == "Complete" and c.status == "True": - trainjob.status = "Succeeded" - elif c.type == "Failed" and c.status == "True": - trainjob.status = "Failed" - - # Select Pods created by the appropriate JobSet. It checks the following ReplicatedJob.name: - # dataset-initializer, model-initializer, launcher, node. - label_selector = "{}={},{} in ({}, {}, {}, {})".format( - constants.JOBSET_NAME_LABEL, - name, - constants.JOBSET_RJOB_NAME_LABEL, - constants.DATASET_INITIALIZER, - constants.MODEL_INITIALIZER, - constants.LAUNCHER, - constants.NODE, - ) - - # Add the TrainJob components, e.g. trainer nodes and initializer. - try: - response = self.core_api.list_namespaced_pod( - namespace, - label_selector=label_selector, - async_req=True, - ).get(constants.DEFAULT_TIMEOUT) - - # Convert Pod to the correct format. - pod_list = models.IoK8sApiCoreV1PodList.from_dict(response.to_dict()) - if not pod_list: - return trainjob - - for pod in pod_list.items: - # Pod must have labels to detect the TrainJob step. - # Every Pod always has a single TrainJob step. - if not ( - pod.metadata - and pod.metadata.name - and pod.metadata.labels - and pod.spec - ): - raise Exception(f"TrainJob Pod is invalid: {pod}") - - # Get the Initializer step. 
- if pod.metadata.labels[constants.JOBSET_RJOB_NAME_LABEL] in { - constants.DATASET_INITIALIZER, - constants.MODEL_INITIALIZER, - }: - step = utils.get_trainjob_initializer_step( - pod.metadata.name, - pod.spec, - pod.status, - ) - # Get the Node step. - elif pod.metadata.labels[constants.JOBSET_RJOB_NAME_LABEL] in { - constants.LAUNCHER, - constants.NODE, - }: - step = utils.get_trainjob_node_step( - pod.metadata.name, - pod.spec, - pod.status, - trainjob.runtime, - pod.metadata.labels[constants.JOBSET_RJOB_NAME_LABEL], - int(pod.metadata.labels[constants.JOB_INDEX_LABEL]), - ) - - trainjob.steps.append(step) - except multiprocessing.TimeoutError: - raise TimeoutError( - f"Timeout to list {constants.TRAINJOB_KIND}'s steps: {namespace}/{name}" - ) - except Exception: - raise RuntimeError( - f"Failed to list {constants.TRAINJOB_KIND}'s steps: {namespace}/{name}" - ) + # Generate unique name for the TrainJob. + # TODO (andreyvelich): Discuss this TrainJob name generation. + train_job_name = random.choice(string.ascii_lowercase) + uuid.uuid4().hex[:11] - return trainjob + self.__backend.train(train_job_name=train_job_name, runtime=runtime, initializer=initializer, trainer=trainer) diff --git a/python/kubeflow/trainer/backends/__init__.py b/python/kubeflow/trainer/backends/__init__.py new file mode 100644 index 000000000..cbc083f4c --- /dev/null +++ b/python/kubeflow/trainer/backends/__init__.py @@ -0,0 +1,28 @@ +# Copyright 2025 The Kubeflow Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from kubeflow.trainer.backends.k8s import K8SBackend +from kubeflow.trainer.backends.local_process import LocalProcessBackend +from kubeflow.trainer.types.backends import K8SBackendConfig, LocalProcessBackendConfig + +TRAINER_BACKEND_REGISTRY = { + "kubernetes": { + "backend_cls": K8SBackend, + "config_cls": K8SBackendConfig, + }, + "local": { + "backend_cls": LocalProcessBackend, + "config_cls": LocalProcessBackendConfig, + } +} \ No newline at end of file diff --git a/python/kubeflow/trainer/backends/base.py b/python/kubeflow/trainer/backends/base.py new file mode 100644 index 000000000..3c70fe405 --- /dev/null +++ b/python/kubeflow/trainer/backends/base.py @@ -0,0 +1,62 @@ +# Copyright 2025 The Kubeflow Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import abc + +from typing import Dict, List, Optional +from kubeflow.trainer.constants import constants +from kubeflow.trainer.types import types + + +class TrainingBackend(abc.ABC): + + @abc.abstractmethod + def list_runtimes(self) -> List[types.Runtime]: + raise NotImplementedError() + + @abc.abstractmethod + def get_runtime(self, name: str) -> Optional[types.Runtime]: + raise NotImplementedError() + + @abc.abstractmethod + def train(self, + train_job_name: str, + runtime: types.Runtime, + initializer: Optional[types.Initializer] = None, + trainer: Optional[types.Trainer] = None, + ) -> str: + raise NotImplementedError() + + @abc.abstractmethod + def list_jobs( + self, runtime: Optional[types.Runtime] = None + ) -> List[types.TrainJob]: + raise NotImplementedError() + + @abc.abstractmethod + def get_job(self, name: str) -> Optional[types.TrainJob]: + raise NotImplementedError() + + @abc.abstractmethod + def get_job_logs(self, + name: str, + follow: Optional[bool] = False, + step: str = constants.NODE, + node_rank: int = 0, + ) -> Dict[str, str]: + raise NotImplementedError() + + @abc.abstractmethod + def delete_job(self, name: str) -> None: + raise NotImplementedError() diff --git a/python/kubeflow/trainer/backends/k8s.py b/python/kubeflow/trainer/backends/k8s.py new file mode 100644 index 000000000..14a324440 --- /dev/null +++ b/python/kubeflow/trainer/backends/k8s.py @@ -0,0 +1,572 @@ +# Copyright 2025 The Kubeflow Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import multiprocessing +import queue +import random +import string +import uuid + +from typing import Dict, List, Optional + +from kubernetes import client, watch +from kubernetes import config as k8s_config +from kubeflow_trainer_api import models + +from kubeflow.trainer.constants import constants +from kubeflow.trainer.types import types +from kubeflow.trainer.types.backends import K8SBackendConfig +from kubeflow.trainer.utils import utils +from kubeflow.trainer.backends import base + + +logger = logging.getLogger(__name__) + + +class K8SBackend(base.TrainingBackend): + + def __init__(self, + cfg: K8SBackendConfig, + ): + if cfg.namespace is None: + cfg.namespace = utils.get_default_target_namespace(cfg.context) + + # If client configuration is not set, use kube-config to access Kubernetes APIs. + if cfg.client_configuration is None: + # Load kube-config or in-cluster config. + if cfg.config_file or not utils.is_running_in_k8s(): + k8s_config.load_kube_config(config_file=cfg.config_file, context=cfg.context) + else: + k8s_config.load_incluster_config() + + k8s_client = client.ApiClient(cfg.client_configuration) + self.custom_api = client.CustomObjectsApi(k8s_client) + self.core_api = client.CoreV1Api(k8s_client) + + self.namespace = cfg.namespace + + def list_runtimes(self) -> List[types.Runtime]: + """List of the available Runtimes. + + Returns: + List[Runtime]: List of available training runtimes. + If no runtimes exist, an empty list is returned. + + Raises: + TimeoutError: Timeout to list Runtimes. + RuntimeError: Failed to list Runtimes. 
+ """ + + result = [] + try: + thread = self.custom_api.list_cluster_custom_object( + constants.GROUP, + constants.VERSION, + constants.CLUSTER_TRAINING_RUNTIME_PLURAL, + async_req=True, + ) + + runtime_list = models.TrainerV1alpha1ClusterTrainingRuntimeList.from_dict( + thread.get(constants.DEFAULT_TIMEOUT) + ) + + if not runtime_list: + return result + + for runtime in runtime_list.items: + result.append(self.__get_runtime_from_crd(runtime)) + + except multiprocessing.TimeoutError: + raise TimeoutError( + f"Timeout to list {constants.CLUSTER_TRAINING_RUNTIME_KIND}s " + f"in namespace: {self.namespace}" + ) + except Exception: + raise RuntimeError( + f"Failed to list {constants.CLUSTER_TRAINING_RUNTIME_KIND}s " + f"in namespace: {self.namespace}" + ) + + return result + + def get_runtime(self, name: str) -> Optional[types.Runtime]: + """Get the the Runtime object""" + + try: + thread = self.custom_api.get_cluster_custom_object( + constants.GROUP, + constants.VERSION, + constants.CLUSTER_TRAINING_RUNTIME_PLURAL, + name, + async_req=True, + ) + + runtime = models.TrainerV1alpha1ClusterTrainingRuntime.from_dict( + thread.get(constants.DEFAULT_TIMEOUT) # type: ignore + ) + + except multiprocessing.TimeoutError: + raise TimeoutError( + f"Timeout to get {constants.CLUSTER_TRAINING_RUNTIME_PLURAL}: " + f"{self.namespace}/{name}" + ) + except Exception: + raise RuntimeError( + f"Failed to get {constants.CLUSTER_TRAINING_RUNTIME_PLURAL}: " + f"{self.namespace}/{name}" + ) + + return self.__get_runtime_from_crd(runtime) # type: ignore + + def train(self, train_job_name: str, + runtime: types.Runtime, + initializer: Optional[types.Initializer] = None, + trainer: Optional[types.Trainer] = None) -> str: + """ + Create the TrainJob. You can configure these types of training task: + + - Custom Training Task: Training with a self-contained function that encapsulates + the entire model training process, e.g. `CustomTrainer`. 
+ + Args: + train_job_name: The name of the training job. + runtime (`types.Runtime`): Reference to one of existing Runtimes. + initializer (`Optional[types.Initializer]`): + Configuration for the dataset and model initializers. + trainer (`Optional[types.CustomTrainer]`): + Configuration for Custom Training Task. + + Returns: + str: The unique name of the TrainJob that has been generated. + + Raises: + ValueError: Input arguments are invalid. + TimeoutError: Timeout to create TrainJobs. + RuntimeError: Failed to create TrainJobs. + """ + + # Build the Trainer. + trainer_crd = models.TrainerV1alpha1Trainer() + + if trainer: + # If users choose to use a custom training function. + if isinstance(trainer, types.CustomTrainer): + trainer_crd = utils.get_trainer_crd_from_custom_trainer( + trainer, runtime + ) + + # If users choose to use a builtin trainer for post-training. + elif isinstance(trainer, types.BuiltinTrainer): + trainer_crd = utils.get_trainer_crd_from_builtin_trainer( + trainer, initializer + ) + + else: + raise ValueError( + f"The trainer type {type(trainer)} is not supported. " + "Please use CustomTrainer or BuiltinTrainer." + ) + + train_job = models.TrainerV1alpha1TrainJob( + apiVersion=constants.API_VERSION, + kind=constants.TRAINJOB_KIND, + metadata=models.IoK8sApimachineryPkgApisMetaV1ObjectMeta( + name=train_job_name + ), + spec=models.TrainerV1alpha1TrainJobSpec( + runtimeRef=models.TrainerV1alpha1RuntimeRef(name=runtime.name), + trainer=( + trainer_crd + if trainer_crd != models.TrainerV1alpha1Trainer() + else None + ), + initializer=( + models.TrainerV1alpha1Initializer( + dataset=utils.get_dataset_initializer(initializer.dataset), + model=utils.get_model_initializer(initializer.model), + ) + if isinstance(initializer, types.Initializer) + else None + ), + ), + ) + + # Create the TrainJob. 
+ try: + self.custom_api.create_namespaced_custom_object( + constants.GROUP, + constants.VERSION, + self.namespace, + constants.TRAINJOB_PLURAL, + train_job.to_dict(), + ) + except multiprocessing.TimeoutError: + raise TimeoutError( + f"Timeout to create {constants.TRAINJOB_KIND}: {self.namespace}/{train_job_name}" + ) + except Exception: + raise RuntimeError( + f"Failed to create {constants.TRAINJOB_KIND}: {self.namespace}/{train_job_name}" + ) + + logger.debug( + f"{constants.TRAINJOB_KIND} {self.namespace}/{train_job_name} has been created" + ) + + return train_job_name + + def list_jobs(self, runtime: Optional[types.Runtime] = None) -> List[types.TrainJob]: + """List of all TrainJobs. + + Returns: + List[TrainerV1alpha1TrainJob]: List of created TrainJobs. + If no TrainJob exist, an empty list is returned. + + Raises: + TimeoutError: Timeout to list TrainJobs. + RuntimeError: Failed to list TrainJobs. + """ + + result = [] + try: + thread = self.custom_api.list_namespaced_custom_object( + constants.GROUP, + constants.VERSION, + self.namespace, + constants.TRAINJOB_PLURAL, + async_req=True, + ) + + trainjob_list = models.TrainerV1alpha1TrainJobList.from_dict( + thread.get(constants.DEFAULT_TIMEOUT) + ) + + if not trainjob_list: + return result + + for trainjob in trainjob_list.items: + # If runtime object is set, we check the TrainJob's runtime reference. 
+ if ( + runtime is not None + and trainjob.spec + and trainjob.spec.runtime_ref + and trainjob.spec.runtime_ref.name != runtime.name + ): + continue + + result.append(self.__get_trainjob_from_crd(trainjob)) + + except multiprocessing.TimeoutError: + raise TimeoutError( + f"Timeout to list {constants.TRAINJOB_KIND}s in namespace: {self.namespace}" + ) + except Exception: + raise RuntimeError( + f"Failed to list {constants.TRAINJOB_KIND}s in namespace: {self.namespace}" + ) + + return result + + def get_job(self, name: str) -> Optional[types.TrainJob]: + """Get the TrainJob object""" + + try: + thread = self.custom_api.get_namespaced_custom_object( + constants.GROUP, + constants.VERSION, + self.namespace, + constants.TRAINJOB_PLURAL, + name, + async_req=True, + ) + + trainjob = models.TrainerV1alpha1TrainJob.from_dict( + thread.get(constants.DEFAULT_TIMEOUT) # type: ignore + ) + + except multiprocessing.TimeoutError: + raise TimeoutError( + f"Timeout to get {constants.TRAINJOB_KIND}: {self.namespace}/{name}" + ) + except Exception: + raise RuntimeError( + f"Failed to get {constants.TRAINJOB_KIND}: {self.namespace}/{name}" + ) + + return self.__get_trainjob_from_crd(trainjob) # type: ignore + + def get_job_logs(self, + name: str, + follow: Optional[bool] = False, + step: str = constants.NODE, + node_rank: int = 0) -> Dict[str, str]: + """Get the logs from TrainJob""" + + # Get the TrainJob Pod name. + pod_name = None + for c in self.get_job(name).steps: + if c.status != constants.POD_PENDING: + if c.name == step or c.name == f"{step}-{node_rank}": + pod_name = c.pod_name + if pod_name is None: + return {} + + # Dict where key is the Pod type and value is the Pod logs. + logs_dict = {} + + # TODO (andreyvelich): Potentially, refactor this. + # Support logging of multiple Pods. + # TODO (andreyvelich): Currently, follow is supported only for node container. 
+ if follow and step == constants.NODE: + log_streams = [] + log_streams.append( + watch.Watch().stream( + self.core_api.read_namespaced_pod_log, + name=pod_name, + namespace=self.namespace, + container=constants.NODE, + ) + ) + finished = [False] * len(log_streams) + + # Create thread and queue per stream, for non-blocking iteration. + log_queue_pool = utils.get_log_queue_pool(log_streams) + + # Iterate over every watching pods' log queue + while True: + for index, log_queue in enumerate(log_queue_pool): + if all(finished): + break + if finished[index]: + continue + # grouping the every 50 log lines of the same pod. + for _ in range(50): + try: + logline = log_queue.get(timeout=1) + if logline is None: + finished[index] = True + break + # Print logs to the StdOut and update results dict. + print(f"[{step}-{node_rank}]: {logline}") + logs_dict[f"{step}-{node_rank}"] = ( + logs_dict.get(f"{step}-{node_rank}", "") + + logline + + "\n" + ) + except queue.Empty: + break + if all(finished): + return logs_dict + + try: + if step == constants.DATASET_INITIALIZER: + logs_dict[constants.DATASET_INITIALIZER] = ( + self.core_api.read_namespaced_pod_log( + name=pod_name, + namespace=self.namespace, + container=constants.DATASET_INITIALIZER, + ) + ) + elif step == constants.MODEL_INITIALIZER: + logs_dict[constants.MODEL_INITIALIZER] = ( + self.core_api.read_namespaced_pod_log( + name=pod_name, + namespace=self.namespace, + container=constants.MODEL_INITIALIZER, + ) + ) + else: + logs_dict[f"{step}-{node_rank}"] = ( + self.core_api.read_namespaced_pod_log( + name=pod_name, + namespace=self.namespace, + container=constants.NODE, + ) + ) + + except Exception: + raise RuntimeError( + f"Failed to read logs for the pod {self.namespace}/{pod_name}" + ) + + return logs_dict + + + def delete_job(self, name: str) -> None: + """Delete the TrainJob. + + Args: + name: Name of the TrainJob. + + Raises: + TimeoutError: Timeout to delete TrainJob. + RuntimeError: Failed to delete TrainJob. 
+ """ + + try: + self.custom_api.delete_namespaced_custom_object( + constants.GROUP, + constants.VERSION, + self.namespace, + constants.TRAINJOB_PLURAL, + name=name, + ) + except multiprocessing.TimeoutError: + raise TimeoutError( + f"Timeout to delete {constants.TRAINJOB_KIND}: {self.namespace}/{name}" + ) + except Exception: + raise RuntimeError( + f"Failed to delete {constants.TRAINJOB_KIND}: {self.namespace}/{name}" + ) + + logger.debug( + f"{constants.TRAINJOB_KIND} {self.namespace}/{name} has been deleted" + ) + + def __get_runtime_from_crd( + self, + runtime_crd: models.TrainerV1alpha1ClusterTrainingRuntime, + ) -> types.Runtime: + if not ( + runtime_crd.metadata + and runtime_crd.metadata.name + and runtime_crd.spec + and runtime_crd.spec.ml_policy + and runtime_crd.spec.template.spec + and runtime_crd.spec.template.spec.replicated_jobs + ): + raise Exception(f"ClusterTrainingRuntime CRD is invalid: {runtime_crd}") + + return types.Runtime( + name=runtime_crd.metadata.name, + trainer=utils.get_runtime_trainer( + runtime_crd.spec.template.spec.replicated_jobs, + runtime_crd.spec.ml_policy, + runtime_crd.metadata, + ), + ) + + def __get_trainjob_from_crd( + self, + trainjob_crd: models.TrainerV1alpha1TrainJob, + ) -> types.TrainJob: + if not ( + trainjob_crd.metadata + and trainjob_crd.metadata.name + and trainjob_crd.metadata.namespace + and trainjob_crd.spec + and trainjob_crd.metadata.creation_timestamp + ): + raise Exception(f"TrainJob CRD is invalid: {trainjob_crd}") + + name = trainjob_crd.metadata.name + namespace = trainjob_crd.metadata.namespace + + # Construct the TrainJob from the CRD. + trainjob = types.TrainJob( + name=name, + creation_timestamp=trainjob_crd.metadata.creation_timestamp, + runtime=self.get_runtime(trainjob_crd.spec.runtime_ref.name), + steps=[], + ) + + # Add the TrainJob status. + # TODO (andreyvelich): Discuss how we should show TrainJob status to SDK users. 
+ # The TrainJob exists at that stage so its status can safely default to Created + trainjob.status = constants.TRAINJOB_CREATED + # Then it can be read from the TrainJob conditions if any + if trainjob_crd.status and trainjob_crd.status.conditions: + for c in trainjob_crd.status.conditions: + if c.type == "Complete" and c.status == "True": + trainjob.status = "Succeeded" + elif c.type == "Failed" and c.status == "True": + trainjob.status = "Failed" + + # Select Pods created by the appropriate JobSet. It checks the following ReplicatedJob.name: + # dataset-initializer, model-initializer, launcher, node. + label_selector = "{}={},{} in ({}, {}, {}, {})".format( + constants.JOBSET_NAME_LABEL, + name, + constants.JOBSET_RJOB_NAME_LABEL, + constants.DATASET_INITIALIZER, + constants.MODEL_INITIALIZER, + constants.LAUNCHER, + constants.NODE, + ) + + # Add the TrainJob components, e.g. trainer nodes and initializer. + try: + response = self.core_api.list_namespaced_pod( + namespace, + label_selector=label_selector, + async_req=True, + ).get(constants.DEFAULT_TIMEOUT) + + # Convert Pod to the correct format. + pod_list = models.IoK8sApiCoreV1PodList.from_dict(response.to_dict()) + if not pod_list: + return trainjob + + for pod in pod_list.items: + # Pod must have labels to detect the TrainJob step. + # Every Pod always has a single TrainJob step. + if not ( + pod.metadata + and pod.metadata.name + and pod.metadata.labels + and pod.spec + ): + raise Exception(f"TrainJob Pod is invalid: {pod}") + + # Get the Initializer step. + if pod.metadata.labels[constants.JOBSET_RJOB_NAME_LABEL] in { + constants.DATASET_INITIALIZER, + constants.MODEL_INITIALIZER, + }: + step = utils.get_trainjob_initializer_step( + pod.metadata.name, + pod.spec, + pod.status, + ) + # Get the Node step. 
+ elif pod.metadata.labels[constants.JOBSET_RJOB_NAME_LABEL] in { + constants.LAUNCHER, + constants.NODE, + }: + step = utils.get_trainjob_node_step( + pod.metadata.name, + pod.spec, + pod.status, + trainjob.runtime, + pod.metadata.labels[constants.JOBSET_RJOB_NAME_LABEL], + int(pod.metadata.labels[constants.JOB_INDEX_LABEL]), + ) + + trainjob.steps.append(step) + except multiprocessing.TimeoutError: + raise TimeoutError( + f"Timeout to list {constants.TRAINJOB_KIND}'s steps: {namespace}/{name}" + ) + except Exception: + raise RuntimeError( + f"Failed to list {constants.TRAINJOB_KIND}'s steps: {namespace}/{name}" + ) + + return trainjob + + + + diff --git a/python/kubeflow/trainer/backends/local_process.py b/python/kubeflow/trainer/backends/local_process.py new file mode 100644 index 000000000..b9aa96004 --- /dev/null +++ b/python/kubeflow/trainer/backends/local_process.py @@ -0,0 +1,190 @@ +# Copyright 2025 The Kubeflow Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import datetime
import logging
import tempfile
import venv
from typing import List, Optional

from kubeflow.trainer.constants import constants
from kubeflow.trainer.types import types, local as local_types
from kubeflow.trainer.types.backends import LocalProcessBackendConfig
from kubeflow.trainer.utils import utils, local as local_utils
from kubeflow.trainer.backends import base
from kubeflow.trainer.local import runtimes as local_runtimes
from kubeflow.trainer.local import job


logger = logging.getLogger(__name__)


class LocalProcessBackend(base.TrainingBackend):
    """Training backend that executes TrainJobs as local subprocesses.

    Each TrainJob is run by a `LocalJob` (a background thread wrapping a
    subprocess). Jobs are tracked in-memory for the lifetime of this backend
    instance; nothing is persisted.
    """

    def __init__(self, cfg: LocalProcessBackendConfig):
        """Create the backend.

        Args:
            cfg: Local-process backend configuration.
        """
        self.cfg = cfg
        # Jobs created by this backend, in creation order.
        self.__jobs: List[job.LocalJob] = []

    def list_runtimes(self) -> List[types.Runtime]:
        """List the available local training runtimes.

        Returns:
            List[Runtime]: Available local runtimes; empty list if none exist.
        """
        return local_runtimes.runtimes

    def get_runtime(self, name: str) -> Optional[local_types.LocalRuntime]:
        """Get the Runtime object for the given name.

        Returns:
            LocalRuntime: Runtime object for the given name.

        Raises:
            ValueError: If no Runtime is found for the given name.
        """
        # `local_runtimes` is a module, so search its `runtimes` list explicitly
        # (a plain `name in local_runtimes` membership test would raise TypeError).
        for runtime in local_runtimes.runtimes:
            if runtime.name == name:
                return runtime
        raise ValueError(f"Runtime '{name}' not found.")

    def train(
        self,
        train_job_name: str,
        runtime: local_types.LocalRuntime,
        initializer: Optional[types.Initializer] = None,
        trainer: Optional[types.Trainer] = None,
    ) -> str:
        """Create a local TrainJob.

        Only Custom Training Tasks are supported: training with a
        self-contained function that encapsulates the entire model training
        process, e.g. `CustomTrainer`.

        Args:
            train_job_name: The name of the training job.
            runtime: Reference to one of the existing local Runtimes.
            initializer: Configuration for dataset/model initializers
                (currently unused by the local backend).
            trainer: Configuration for the Custom Training Task.

        Returns:
            str: The name of the TrainJob that has been started.

        Raises:
            ValueError: Input arguments are invalid.
        """
        if not trainer:
            raise ValueError("Cannot create TrainJob without a Trainer")
        # Fail early with a clear message instead of an AttributeError later:
        # only CustomTrainer carries func/func_args/pip_index_url.
        if not isinstance(trainer, types.CustomTrainer):
            raise ValueError(
                "LocalProcessBackend only supports CustomTrainer tasks"
            )

        # Per-job scratch directory that also hosts the virtualenv.
        target_dir = tempfile.mkdtemp(prefix=f"{train_job_name}-")
        runtime.execution_dir = target_dir

        if runtime.create_venv:
            self.__create_venv(env_dir=target_dir)
            runtime.python_path = local_utils.get_venv_python_path(target_dir)

        command, args = local_utils.build_local_training_executable(
            runtime,
            trainer.func,
            trainer.func_args,
            trainer.pip_index_url,
            trainer.packages_to_install,
        )

        j = job.LocalJob(name=train_job_name, command=args)
        # Record creation time once, so list_jobs/get_job report a stable
        # timestamp instead of "now" at every query.
        j.created_at = datetime.datetime.now()
        self.__jobs.append(j)
        j.start()

        return train_job_name

    def __create_venv(self, env_dir: str) -> None:
        """Create a virtual environment (with pip) for the training job."""
        venv.create(env_dir=env_dir, with_pip=True)

    def list_jobs(
        self, runtime: Optional[types.Runtime] = None
    ) -> List[local_types.LocalTrainJob]:
        """List all local TrainJobs.

        Returns:
            List[LocalTrainJob]: Created LocalTrainJobs; empty list if none.
        """
        return [
            local_types.LocalTrainJob(
                name=j.name,
                creation_timestamp=getattr(j, "created_at", None)
                or datetime.datetime.now(),
                runtime=runtime,
                steps=[],
                job=j,
            )
            for j in self.__jobs
        ]

    def get_job(self, name: str) -> Optional[local_types.LocalTrainJob]:
        """Get the TrainJob object.

        Returns:
            LocalTrainJob: The TrainJob with the given name.

        Raises:
            ValueError: If no TrainJob with that name exists.
        """
        for j in self.__jobs:
            if j.name == name:
                return local_types.LocalTrainJob(
                    name=j.name,
                    creation_timestamp=getattr(j, "created_at", None)
                    or datetime.datetime.now(),
                    runtime=None,
                    steps=[],
                    job=j,
                )
        raise ValueError("No TrainJob with name '%s'" % name)

    def get_job_logs(
        self,
        name: str,
        follow: Optional[bool] = False,
        step: str = constants.NODE,
        node_rank: int = 0,
    ) -> List[str]:
        """Get the logs from a TrainJob.

        Args:
            name: The name of the TrainJob.
            follow: Stream the logs until the job finishes (default: False).
            step: Step name [NOT IMPLEMENTED for the local backend].
            node_rank: Node rank [NOT IMPLEMENTED for the local backend].

        Returns:
            List[str]: Log lines (or streamed chunks when following).

        Raises:
            ValueError: If the TrainJob does not exist.
        """
        trainjob = self.get_job(name=name)
        # Honor the `follow` flag: without it, return a snapshot of the
        # captured output instead of blocking until the job completes.
        if not follow:
            return trainjob.job.stdout.splitlines()

        lines = []
        for line in trainjob.job.follow_logs():
            print(line)
            lines.append(line)
        return lines

    def delete_job(self, name: str) -> None:
        """Delete the TrainJob from the backend's registry.

        NOTE(review): this LocalJob version exposes no cancellation API, so a
        still-running subprocess keeps running after removal — confirm whether
        deletion should also terminate the process.

        Args:
            name: Name of the TrainJob.

        Raises:
            ValueError: If the TrainJob does not exist.
        """
        for j in self.__jobs:
            if j.name == name:
                self.__jobs.remove(j)
                return
        raise ValueError("No TrainJob with name '%s'" % name)
# Copyright 2025 The Kubeflow Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import threading
import subprocess
import logging

logger = logging.getLogger(__name__)


class LocalJob(threading.Thread):
    """Run a shell command as a background job on a thread.

    The command's combined stdout/stderr is captured incrementally into an
    internal buffer that can be read while the job is running (via ``stdout``
    or ``follow_logs``). A job may depend on other ``LocalJob`` instances; it
    waits for them and skips execution if any dependency failed.
    """

    def __init__(self, name, command, dependencies=None):
        """Create the job. The subprocess is only spawned by ``start()``.

        Args:
            name: Human-readable job name (also used as the thread name).
            command: Shell command line, executed with ``shell=True``.
            dependencies: Optional list of LocalJobs that must succeed first.
        """
        super().__init__()
        self.name = name
        self.command = command
        self.dependencies = dependencies or []
        self._stdout = ""
        self._stderr = ""
        self._returncode = None
        self._success = False
        # Guards every read/write of the mutable state above: the worker
        # thread and caller threads access it concurrently.
        self._lock = threading.Lock()
        # Signalled whenever new output is appended, so followers wake up.
        self._output_updated = threading.Event()

    def run(self):
        # Wait for dependencies; skip execution if any of them failed.
        for dep in self.dependencies:
            dep.join()
            if not dep.success:
                with self._lock:
                    self._stderr = f"Dependency {dep.name} failed. Skipping."
                return

        logger.debug("[%s] Starting...", self.name)
        try:
            # NOTE(review): the command is a trusted, SDK-built shell string;
            # shell=True would be unsafe for untrusted input — confirm callers
            # never pass user-controlled text here.
            process = subprocess.Popen(
                self.command,
                shell=True,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,  # merge stderr into the stdout stream
                text=True,
            )

            # Stream output line by line so followers see it promptly.
            for line in iter(process.stdout.readline, ""):
                with self._lock:
                    self._stdout += line
                self._output_updated.set()

            process.stdout.close()
            process.wait()

            with self._lock:
                self._returncode = process.returncode
                self._success = process.returncode == 0
            # Log instead of print(): library code must not write to stdout.
            logger.debug(
                "[%s] Completed with code %s.", self.name, self._returncode
            )
        except Exception as e:
            with self._lock:
                self._stderr += f"Exception: {e}\n"
                self._success = False

    @property
    def stdout(self):
        """Combined stdout/stderr captured so far."""
        with self._lock:
            return self._stdout

    @property
    def success(self):
        """True once the process exited with return code 0."""
        with self._lock:
            return self._success

    @property
    def returncode(self):
        """Exit code of the process, or None while it is still running."""
        with self._lock:
            return self._returncode

    def follow_logs(self):
        """Generator that yields chunks of new output as they come in."""
        last_index = 0
        while True:
            with self._lock:
                # Stop once the thread has finished and the buffer is drained.
                done = not self.is_alive() and last_index >= len(self._stdout)
            if done:
                break
            self._output_updated.wait(timeout=1)
            with self._lock:
                new_data = self._stdout[last_index:]
                last_index = len(self._stdout)
                self._output_updated.clear()
            if new_data:
                yield new_data
# Copyright 2025 The Kubeflow Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from kubeflow.trainer.types import local as local_types, types

# Built-in runtime definitions understood by the local-process backend.
# Each entry describes how to launch a training entrypoint on this machine.
_TORCH_DISTRIBUTED = local_types.LocalRuntime(
    name="torch-distributed",
    trainer=types.Trainer(
        trainer_type=types.TrainerType.CUSTOM_TRAINER,
        framework=types.Framework.TORCH,
        entrypoint=["torchrun"],
    ),
    pretrained_model="",
    command=["torchrun"],
    create_venv=True,
)

# Public registry consumed by LocalProcessBackend.list_runtimes().
runtimes = [_TORCH_DISTRIBUTED]
# Copyright 2025 The Kubeflow Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Optional

from pydantic import BaseModel
from kubernetes import client


class BackendConfig(BaseModel):
    """Base class for all TrainerClient backend configurations."""


class K8SBackendConfig(BackendConfig):
    """Configuration for the Kubernetes backend.

    Mirrors the kube-config loading options: config file path, context name,
    client configuration for cluster authentication, and target namespace.
    """

    namespace: Optional[str] = None
    config_file: Optional[str] = None
    context: Optional[str] = None
    client_configuration: Optional[client.Configuration] = None

    class Config:
        # client.Configuration is a plain Kubernetes client class, not a
        # pydantic model, so arbitrary types must be allowed.
        arbitrary_types_allowed = True


class LocalProcessBackendConfig(BackendConfig):
    """Configuration for the local subprocess backend."""

    # Whether a dedicated virtual environment is prepared for training jobs.
    create_env: Optional[bool] = True
# Copyright 2025 The Kubeflow Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from typing import List, Optional, Union
from dataclasses import dataclass, field
from pathlib import Path
import sys

from kubeflow.trainer.types import types
from kubeflow.trainer.local.job import LocalJob


@dataclass
class LocalRuntime(types.Runtime):
    """A Runtime executed on the local machine instead of a cluster.

    Extra attributes on top of types.Runtime:
        create_venv: Whether a dedicated virtualenv is created for the job.
        command: Entrypoint command tokens (e.g. ["torchrun"]).
        python_path: Python interpreter used to run the training script.
        execution_dir: Working directory (and virtualenv root) for the job;
            assigned by the backend right before training starts.
    """

    create_venv: Optional[bool] = True
    command: List[str] = field(default_factory=list)
    python_path: Optional[str] = sys.executable
    execution_dir: Optional[str] = None

    def get_executable_command(self) -> str:
        """Resolve the command entrypoint to an executable path.

        When ``create_venv`` is set, the command is resolved inside the
        virtualenv's bin/ (Scripts/ on Windows) directory; otherwise the
        joined command string is returned as-is and resolved through PATH.

        Returns:
            str: The command to execute.

        Raises:
            ValueError: ``create_venv`` is set but ``execution_dir`` has not
                been assigned yet (no virtualenv location to resolve against).
        """
        command_str = " ".join(self.command).lstrip()
        if not self.create_venv:
            return command_str

        if self.execution_dir is None:
            # Fail with a clear message instead of Path(None)'s TypeError.
            raise ValueError(
                "execution_dir must be set before resolving the venv command"
            )

        # Windows venvs keep executables under Scripts/, POSIX under bin/.
        bin_dir = "Scripts" if os.name == "nt" else "bin"
        # NOTE(review): the executable's existence is not verified here; the
        # venv must already be created — confirm callers guarantee this.
        return str(Path(self.execution_dir) / bin_dir / command_str)


@dataclass
class LocalTrainJob(types.TrainJob):
    # Handle to the background thread/subprocess executing this TrainJob.
    # Optional because the dataclass needs a default; the local backend
    # always populates it.
    job: Optional[LocalJob] = None
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +import logging +import os +import sys +from pathlib import Path +import textwrap +from typing import List, Callable, Optional, Dict, Any, Tuple +from kubeflow.trainer.constants import constants +from kubeflow.trainer.types import local as local_types + +logger = logging.getLogger(__name__) + +# @szaher we don't need this one but could be useful for discussion +def setup_kubeflow_local_config_dir(): + logger.debug("Setting up kubeflow local config dir %s", constants.DEFAULT_CFG_DIR) + + def _create_dir(dir_name: str, base_dir: str = ""): + if base_dir: + dir_name = os.path.join(base_dir, dir_name) + os.makedirs(dir_name, exist_ok=True) + + cfg_dirs = constants.DEFAULT_CFG_SUB_DIRS.copy() + + # create base directory + _create_dir(constants.DEFAULT_CFG_DIR) + + # create nested directories + [ + _create_dir(dir_name=_dir, base_dir=constants.DEFAULT_CFG_DIR) for _dir in cfg_dirs + ] + + +def get_venv_python_path(venv_dir: str) -> str: + venv_path = Path(venv_dir) + if os.name == 'nt': + # Windows + python_exe = venv_path / "Scripts" / "python.exe" + else: + # Unix / macOS + python_exe = venv_path / "bin" / "python" + + if not python_exe.exists(): + raise FileNotFoundError(f"Python executable not found in virtualenv at: {python_exe}") + + return str(python_exe) + + +def get_script_for_local_python_packages_install( + packages_to_install: List[str], + pip_index_url: str, + as_user: str = None, + python_binary: str = sys.executable, +) -> str: + """ + Get init script to install Python packages from the given pip index URL. 
+ """ + packages_str = " ".join([str(package) for package in packages_to_install]) + + script_for_python_packages = textwrap.dedent( + """ + if ! [ -x "$(command -v pip)" ]; then + {python_binary} -m ensurepip || {python_binary} -m ensurepip --user + fi + + PIP_DISABLE_PIP_VERSION_CHECK=1 {python_binary} -m pip install --quiet \ + --no-warn-script-location --index-url {index_url} {packages} {as_user} + """.format( + python_binary=python_binary, + index_url=pip_index_url, + packages=packages_str, + as_user="--user" if as_user else "", + ) + ) + + return script_for_python_packages + + + + +def build_local_training_executable( + runtime: local_types.LocalRuntime, + train_func: Callable, + train_func_parameters: Optional[Dict[str, Any]], + pip_index_url: str, + packages_to_install: Optional[List[str]] = None, +) -> Tuple[str, List[str]]: + """ + Get the Trainer command and args from the given training function and parameters. + """ + # Check if training function is callable. + if not callable(train_func): + raise ValueError( + f"Training function must be callable, got function type: {type(train_func)}" + ) + + # Extract the function implementation. + func_code = inspect.getsource(train_func) + + # Extract the file name where the function is defined. + func_file = os.path.basename(inspect.getfile(train_func)) + + # Function might be defined in some indented scope (e.g. in another function). + # We need to dedent the function code. + func_code = textwrap.dedent(func_code) + + # Wrap function code to execute it from the file. For example: + # TODO (andreyvelich): Find a better way to run users' scripts. 
+ # def train(parameters): + # print('Start Training...') + # train({'lr': 0.01}) + if train_func_parameters is None: + func_code = f"{func_code}\n{train_func.__name__}()\n" + else: + func_code = f"{func_code}\n{train_func.__name__}({train_func_parameters})\n" + + command = runtime.get_executable_command() + python_entrypoint = runtime.python_path + + exec_script = textwrap.dedent( + """ + read -r -d '' SCRIPT << EOM\n + {func_code} + EOM + printf "%s" \"$SCRIPT\" > \"{func_file}\" + {python_entrypoint} \"{func_file}\"""" + ) + + # Add function code to the execute script. + exec_script = exec_script.format( + func_code=func_code, + func_file=func_file, + python_entrypoint=command, + ) + + # Install Python packages if that is required. + if packages_to_install is not None: + exec_script = ( + get_script_for_local_python_packages_install( + packages_to_install=packages_to_install, + pip_index_url=pip_index_url, + python_binary=runtime.python_path + ) + + exec_script + ) + + # Return container command and args to execute training function. 
+ return command, [exec_script] + + From 908af68780fd10347feb5ece99ed3ba9d60279e2 Mon Sep 17 00:00:00 2001 From: Saad Zaher Date: Wed, 25 Jun 2025 12:11:44 +0100 Subject: [PATCH 02/33] Implement Job Cancellation Signed-off-by: Saad Zaher --- python/kubeflow/trainer/api/trainer_client.py | 2 +- .../trainer/backends/local_process.py | 71 ++++++++++------ python/kubeflow/trainer/local/job.py | 85 +++++++++++++++---- python/kubeflow/trainer/types/backends.py | 2 +- python/kubeflow/trainer/types/types.py | 2 +- python/kubeflow/trainer/utils/local.py | 1 + 6 files changed, 118 insertions(+), 45 deletions(-) diff --git a/python/kubeflow/trainer/api/trainer_client.py b/python/kubeflow/trainer/api/trainer_client.py index 7b61c5463..48cd2015c 100644 --- a/python/kubeflow/trainer/api/trainer_client.py +++ b/python/kubeflow/trainer/api/trainer_client.py @@ -27,7 +27,7 @@ logger = logging.getLogger(__name__) -class TrainerClient(object): +class TrainerClient: def __init__(self, backend_type: Optional[str] = "kubernetes", backend_config: Optional[BackendConfig] = None): """ diff --git a/python/kubeflow/trainer/backends/local_process.py b/python/kubeflow/trainer/backends/local_process.py index b9aa96004..5f19568d8 100644 --- a/python/kubeflow/trainer/backends/local_process.py +++ b/python/kubeflow/trainer/backends/local_process.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import datetime import logging import tempfile import venv @@ -49,12 +48,18 @@ def list_runtimes(self) -> List[types.Runtime]: return local_runtimes.runtimes - def get_runtime(self, name: str) -> Optional[types.Runtime]: - """Get the the Runtime object""" - if name not in local_runtimes: + def get_runtime(self, name: str) -> Optional[local_types.LocalRuntime]: + """Get the the Runtime object + Returns: + LocalRuntime: Runtime object for the given name. 
+ Raises: + ValueError: If no Runtime is found for the given name. + """ + _runtime = [rt for rt in local_runtimes.runtimes if rt.name == name] + if not _runtime: raise ValueError(f"Runtime '{name}' not found.") - return local_runtimes[name] + return _runtime[0] def train(self, train_job_name:str, @@ -62,7 +67,7 @@ def train(self, train_job_name:str, initializer: Optional[types.Initializer] = None, trainer: Optional[types.Trainer] = None) -> str: """ - Create the TrainJob. You can configure these types of training task: + Create the LocalTrainJob. You can configure these types of training task: - Custom Training Task: Training with a self-contained function that encapsulates the entire model training process, e.g. `CustomTrainer`. @@ -80,8 +85,6 @@ def train(self, train_job_name:str, Raises: ValueError: Input arguments are invalid. - TimeoutError: Timeout to create TrainJobs. - RuntimeError: Failed to create TrainJobs. """ # Build the env if not trainer: @@ -121,32 +124,42 @@ def train(self, train_job_name:str, return train_job_name def __create_venv(self, env_dir: str) -> None: + """Create Virtual Environment for the Training Job. + """ + # @szaher do we need to replace this with another LocalJob for Env preparation? venv.create(env_dir=env_dir, with_pip=True) def list_jobs(self, runtime: Optional[types.Runtime] = None) -> List[local_types.LocalTrainJob]: """List of all TrainJobs. Returns: - List[TrainerV1alpha1TrainJob]: List of created TrainJobs. + List[LocalTrainJob]: List of created LocalTrainJobs. If no TrainJob exist, an empty list is returned. - - Raises: - TimeoutError: Timeout to list TrainJobs. - RuntimeError: Failed to list TrainJobs. 
""" - result = [local_types.LocalTrainJob(name=j.name, creation_timestamp=datetime.datetime.now(), runtime=runtime, steps=[], job=j) for j in self.__jobs] + result = [ + local_types.LocalTrainJob( + name=j.name, creation_timestamp=j.creation_time, + runtime=runtime, steps=[], job=j, + ) + for j in self.__jobs + ] return result def get_job(self, name: str) -> Optional[local_types.LocalTrainJob]: - """Get the TrainJob object""" + """Get the TrainJob object. + Returns: + LocalTrainJob: LocalTrainJob object. + Raises: + ValueError: if TrainJob does not exist. + """ j = [j for j in self.__jobs if j.name == name] if not j: raise ValueError("No TrainJob with name '%s'" % name) return local_types.LocalTrainJob( name=j[0].name, - creation_timestamp=datetime.datetime.now(), + creation_timestamp=j[0].completion_time, runtime=None, steps=[], job=j[0] ) @@ -155,15 +168,19 @@ def get_job_logs(self, follow: Optional[bool] = False, step: str = constants.NODE, node_rank: int = 0) -> List[str]: - """Get the logs from TrainJob""" + """Get the logs from TrainJob + Args: + name (`str`) : The name of the TrainJob. + follow (`Optional[bool]`): Follow the log stream or not (default: False). + step (`str`): Step number (default: 0) [NOT IMPLEMENTED]. + node_rank (`int`): Node rank (default: 0) [NOT IMPLEMENTED]. + Returns: + List[str]: List of logs from TrainJob. + Raises: + ValueError: if TrainJob does not exist. + """ j = self.get_job(name=name) - lines = [] - for line in j.job.follow_logs(): - print(line) - lines.append(line) - return lines - - + return j.job.logs(follow=follow) def delete_job(self, name: str) -> None: """Delete the TrainJob. @@ -172,8 +189,7 @@ def delete_job(self, name: str) -> None: name: Name of the TrainJob. Raises: - TimeoutError: Timeout to delete TrainJob. - RuntimeError: Failed to delete TrainJob. + ValueError: if TrainJob does not exist. 
""" # delete job from registry or job list @@ -181,6 +197,9 @@ def delete_job(self, name: str) -> None: if not target: raise ValueError("No TrainJob with name '%s'" % name) + # request process cancellation + target[0].cancel() + # remove the job from the list of jobs self.__jobs.remove(target[0]) diff --git a/python/kubeflow/trainer/local/job.py b/python/kubeflow/trainer/local/job.py index bb83f6298..758238867 100644 --- a/python/kubeflow/trainer/local/job.py +++ b/python/kubeflow/trainer/local/job.py @@ -15,57 +15,85 @@ import threading import subprocess import logging +from datetime import datetime +from typing import List logger = logging.getLogger(__name__) class LocalJob(threading.Thread): def __init__(self, name, command, dependencies=None): + """Create a LocalJob. Create a local subprocess with threading to allow users + to create background jobs. + :param name: The name of the job. + :type name: str + :param command: The command to run. + :type command: str + :param dependencies: The dependencies to run in the job. + :type dependencies: List[job.LocalJob] + """ super().__init__() self.name = name self.command = command self.dependencies = dependencies or [] self._stdout = "" - self._stderr = "" self._returncode = None self._success = False self._lock = threading.Lock() + self._process = None self._output_updated = threading.Event() + self._cancel_requested = threading.Event() + self._start_time = None + self._end_time = None def run(self): for dep in self.dependencies: dep.join() if not dep.success: with self._lock: - self._stderr = f"Dependency {dep.name} failed. Skipping." + self._stdout = f"Dependency {dep.name} failed. Skipping." return logger.debug(f"[{self.name}] Starting...") try: - process = subprocess.Popen( + self._start_time = datetime.now() + self._process = subprocess.Popen( self.command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, - text=True + text=True, + # @szaher how do we need to handle signals passed to child processes? 
+ # preexec_fn=None if hasattr(subprocess, "CREATE_NEW_PROCESS_GROUP") else lambda: signal.signal(signal.SIGINT, signal.SIG_IGN), ) - # Read output line by line (for streaming) - for line in iter(process.stdout.readline, ''): + while True: + if self._cancel_requested.is_set(): + self._process.terminate() + self._stdout += "[TrainingCancelled]\n" + self._success = False + return + + # Read output line by line (for streaming) + output_line = self._process.stdout.readline() with self._lock: - self._stdout += line - self._output_updated.set() + if output_line: + self._stdout += output_line + self._output_updated.set() - process.stdout.close() - process.wait() + if not output_line and self._process.poll() is not None: + break + + self._process.stdout.close() + self._returncode = self._process.wait() + self._end_time = datetime.now() + self._success = (self._process.returncode == 0) + msg = f"[{self.name}] Completed with code {self._returncode} in {self._end_time - self._start_time} seconds." + self._stdout += msg - with self._lock: - self._returncode = process.returncode - self._success = (process.returncode == 0) - print(f"[{self.name}] Completed with code {self._returncode}.") except Exception as e: with self._lock: - self._stderr += f"Exception: {e}\n" + self._stdout += f"Exception: {e}\n" self._success = False @property @@ -77,11 +105,28 @@ def stdout(self): def success(self): return self._success + def cancel(self): + self._cancel_requested.set() + @property def returncode(self): return self._returncode - def follow_logs(self): + def logs(self, follow=False) -> List[str]: + """Print log lines""" + if not follow: + return self._stdout.splitlines() + output_lines = "" + try: + for line in next(self.__follow_logs()): + print(line, end="") + output_lines += line + except StopIteration: + pass + + return output_lines.splitlines() + + def __follow_logs(self): """Generator that yields new output lines as they come in.""" last_index = 0 while self.is_alive() or last_index 
< len(self._stdout): @@ -93,3 +138,11 @@ def follow_logs(self): self._output_updated.clear() if new_data: yield new_data + + @property + def creation_time(self): + return self._start_time + + @property + def completion_time(self): + return self._end_time diff --git a/python/kubeflow/trainer/types/backends.py b/python/kubeflow/trainer/types/backends.py index c10672bfb..463d223be 100644 --- a/python/kubeflow/trainer/types/backends.py +++ b/python/kubeflow/trainer/types/backends.py @@ -30,5 +30,5 @@ class Config: arbitrary_types_allowed = True class LocalProcessBackendConfig(BackendConfig): - create_env: Optional[bool] = True + pass diff --git a/python/kubeflow/trainer/types/types.py b/python/kubeflow/trainer/types/types.py index e46883148..97b375c1d 100644 --- a/python/kubeflow/trainer/types/types.py +++ b/python/kubeflow/trainer/types/types.py @@ -1,4 +1,4 @@ -# Copyright 2024-2025 The Kubeflow Authors. +# Copyright 2024 The Kubeflow Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/python/kubeflow/trainer/utils/local.py b/python/kubeflow/trainer/utils/local.py index 986bb5308..421f6a836 100644 --- a/python/kubeflow/trainer/utils/local.py +++ b/python/kubeflow/trainer/utils/local.py @@ -24,6 +24,7 @@ logger = logging.getLogger(__name__) + # @szaher we don't need this one but could be useful for discussion def setup_kubeflow_local_config_dir(): logger.debug("Setting up kubeflow local config dir %s", constants.DEFAULT_CFG_DIR) From 3d578c774a31789c5026c7687b69f3ccf6182142 Mon Sep 17 00:00:00 2001 From: Saad Zaher Date: Wed, 9 Jul 2025 02:25:12 +0100 Subject: [PATCH 03/33] update local job to add resouce limitation in k8s style Signed-off-by: Saad Zaher --- python/kubeflow/trainer/api/trainer_client.py | 28 +-- .../trainer/backends/local_process.py | 15 ++ .../kubeflow/trainer/constants/constants.py | 2 + python/kubeflow/trainer/local/job.py | 18 +- .../trainer/local/resource_manager.py | 162 ++++++++++++++++++ 5 files changed, 212 insertions(+), 13 deletions(-) create mode 100644 python/kubeflow/trainer/local/resource_manager.py diff --git a/python/kubeflow/trainer/api/trainer_client.py b/python/kubeflow/trainer/api/trainer_client.py index 7f19d7376..2d9aa05df 100644 --- a/python/kubeflow/trainer/api/trainer_client.py +++ b/python/kubeflow/trainer/api/trainer_client.py @@ -13,6 +13,7 @@ # limitations under the License. import logging +import os import random import string import uuid @@ -37,20 +38,27 @@ def __init__(self, backend_type: Optional[str] = "kubernetes", backend_config: O backend_type: name of the backend to be used. default is kubernetes. backend_config: backend configuration. default is None. 
""" - backend = self.__init_backend(backend_type, backend_config) - self.__backend = backend + # if KF_TRAINING_BACKEND environment variable is set, use it to initialize training backend + backend_type = os.environ.get(constants.KF_TRAINING_BACKEND_NAME, backend_type) + # initialize training backend + self.__backend = self.__init_backend(backend_type, backend_config) - def __init_backend(self, backendtype: str, backendconfig: BackendConfig): - backend = TRAINER_BACKEND_REGISTRY.get(backendtype.lower()) + def __init_backend(self, backend_type: str, backend_config: BackendConfig): + backend = TRAINER_BACKEND_REGISTRY.get(backend_type.lower()) if not backend: - raise ValueError("Unknown backend type '{}'".format(backendtype)) + raise ValueError("Unknown backend type '{}'".format(backend_type)) # load the backend class backend_cls = backend.get("backend_cls") # check if backend configuration is present - if not backendconfig: - backendconfig = backend.get("config_cls")() + if not backend_config: + backend_config = backend.get("config_cls")() + # check if provided backend config instance uses the correct config class + if not isinstance(backend_config, backend.get("config_cls")): + raise ValueError(f"Wrong Backend Configuration provided. " + f"{backend_type} requires config instance " + f"of type {backend.get('config_cls')}") # initialize the backend class with the user provided config - return backend_cls(cfg=backendconfig) + return backend_cls(cfg=backend_config) def list_runtimes(self): """List of the available Runtimes. @@ -110,7 +118,7 @@ def train(self, runtime: types.Runtime = types.DEFAULT_RUNTIME, initializer: Optional[types.Initializer] = None, trainer: Optional[Union[types.CustomTrainer, types.BuiltinTrainer]] = None, - ): + ) -> str: """ Create the TrainJob. 
You can configure these types of training task: - Custom Training Task: Training with a self-contained function that encapsulates @@ -134,4 +142,4 @@ def train(self, # TODO (andreyvelich): Discuss this TrainJob name generation. train_job_name = random.choice(string.ascii_lowercase) + uuid.uuid4().hex[:11] - self.__backend.train(train_job_name=train_job_name, runtime=runtime, initializer=initializer, trainer=trainer) + return self.__backend.train(train_job_name=train_job_name, runtime=runtime, initializer=initializer, trainer=trainer) diff --git a/python/kubeflow/trainer/backends/local_process.py b/python/kubeflow/trainer/backends/local_process.py index 5f19568d8..f059436ae 100644 --- a/python/kubeflow/trainer/backends/local_process.py +++ b/python/kubeflow/trainer/backends/local_process.py @@ -109,10 +109,25 @@ def train(self, train_job_name:str, trainer.packages_to_install, ) + memory_limit = None + cpu_limit = None + cpu_time = None + nice = 0 + + if hasattr(trainer, "resources_per_node"): + memory_limit = trainer.resources_per_node.get("memory") + cpu_limit = trainer.resources_per_node.get("cpu") + cpu_time = trainer.resources_per_node.get("cpu_time") + nice = trainer.resources_per_node.get("nice") + # prepare local job j = job.LocalJob( name=train_job_name, command=args, + cpu_time=cpu_time, + cpu_limit=cpu_limit, + mem_limit=memory_limit, + nice=nice, ) # register job in local list diff --git a/python/kubeflow/trainer/constants/constants.py b/python/kubeflow/trainer/constants/constants.py index 68f1cf637..c153741c1 100644 --- a/python/kubeflow/trainer/constants/constants.py +++ b/python/kubeflow/trainer/constants/constants.py @@ -131,6 +131,8 @@ # local execution +# environment variable to pass which training backend to use +KF_TRAINING_BACKEND_NAME = "KF_TRAINING_BACKEND" ## local config dir DEFAULT_CFG_DIR = os.path.expanduser("~/.kubeflow/trainer") # dir for storing local runtimes diff --git a/python/kubeflow/trainer/local/job.py 
b/python/kubeflow/trainer/local/job.py index 758238867..6803da944 100644 --- a/python/kubeflow/trainer/local/job.py +++ b/python/kubeflow/trainer/local/job.py @@ -16,13 +16,15 @@ import subprocess import logging from datetime import datetime -from typing import List +from typing import List, Union + +from kubeflow.trainer.local import resource_manager logger = logging.getLogger(__name__) class LocalJob(threading.Thread): - def __init__(self, name, command, dependencies=None): + def __init__(self, name, command: Union[List, str], mem_limit=None, cpu_time=None, cpu_limit=None, nice=0, dependencies=None): """Create a LocalJob. Create a local subprocess with threading to allow users to create background jobs. :param name: The name of the job. @@ -45,6 +47,11 @@ def __init__(self, name, command, dependencies=None): self._cancel_requested = threading.Event() self._start_time = None self._end_time = None + # limit cpu and memory resources + self.__memory_limit = mem_limit + self.__cpu_time = cpu_time + self.__cpu_limit = cpu_limit + self.__nice = nice def run(self): for dep in self.dependencies: @@ -64,7 +71,12 @@ def run(self): stderr=subprocess.STDOUT, text=True, # @szaher how do we need to handle signals passed to child processes? - # preexec_fn=None if hasattr(subprocess, "CREATE_NEW_PROCESS_GROUP") else lambda: signal.signal(signal.SIGINT, signal.SIG_IGN), + preexec_fn=lambda: resource_manager.setup_local_process( + mem_limit=self.__memory_limit, + cpu_time=self.__cpu_time, + cpu_limit=self.__cpu_limit, + nice=self.__nice, + ) ) while True: diff --git a/python/kubeflow/trainer/local/resource_manager.py b/python/kubeflow/trainer/local/resource_manager.py new file mode 100644 index 000000000..816d18c5a --- /dev/null +++ b/python/kubeflow/trainer/local/resource_manager.py @@ -0,0 +1,162 @@ +# Copyright 2025 The Kubeflow Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +spawn_limited.py + +Cross-platform subprocess launcher that can: + • cap memory (bytes) + • cap total CPU-seconds + • pin to specific CPU cores + • lower priority (nice value) +""" +# POSIX‐only imports; will be skipped on Windows +import re +try: + import resource + import os +except ImportError: + resource = None + os = None + + +def _parse_mem_limit(limit): + """ + Parse a k8s-style memory string into bytes. + Accepts: + - int or float (interpreted as bytes) + - str: "", where suffix is one of: + K, KB, Ki, KiB, M, MB, Mi, MiB, G, GB, Gi, GiB, ... + Allows decimal or binary prefixes and optional 'B'. + """ + if isinstance(limit, (int, float)): + return int(limit) + s = limit.upper().strip() + m = re.fullmatch(r"(\d+(\.\d+)?)([KMGTPEmgtpe][iI]?[bB]?)?", s) + if not m: + raise ValueError(f"Invalid memory limit format: {limit!r}") + number = float(m.group(1)) + unit = (m.group(3) or "").upper() + + # mapping suffix → multiplier + mult = { + "K": 10**3, "KB": 10**3, + "KI": 2**10, "KIB": 2**10, + "M": 10**6, "MB": 10**6, + "MI": 2**20, "MIB": 2**20, + "G": 10**9, "GB": 10**9, + "GI": 2**30, "GIB": 2**30, + "T": 10**12, "TB": 10**12, + "TI": 2**40, "TIB": 2**40, + "P": 10**15, "PB": 10**15, + "PI": 2**50, "PIB": 2**50, + "E": 10**18, "EB": 10**18, + "EI": 2**60, "EIB": 2**60, + "": 1, + } + if unit not in mult: + raise ValueError(f"Unknown memory unit: {unit!r}") + return int(number * mult[unit]) + + +def _parse_cpu_limit(limit): + """ + Parse a k8s-style CPU string into a float number of CPUs. 
+    - "250m" ⇒ 0.25
+    - "1"    ⇒ 1.0
+    -  2     ⇒ 2.0
+    """
+    if isinstance(limit, (int, float)):
+        return float(limit)
+    s = limit.strip()
+    m = re.fullmatch(r"(\d+(\.\d+)?)(m?)", s, re.IGNORECASE)
+    if not m:
+        raise ValueError(f"Invalid CPU limit: {limit!r}")
+    number = float(m.group(1))
+    if m.group(3).lower() == "m":
+        return number / 1000.0
+    return number
+
+
+def _limit_resources_posix(mem_limit=None, cpu_time=None, cpu_cores=None, nice=0):
+    """
+    Called in the child (via preexec_fn) on Linux/macOS.
+    - mem_limit: int bytes or k8s-style str (e.g. "1Gi", "512M", "8G")
+    - cpu_time: int seconds; maximum CPU-seconds before SIGXCPU
+    - cpu_cores: iterable of core indices (Linux only)
+    - nice: added niceness
+    """
+    # 1) Lower priority
+    if nice and os:
+        os.nice(nice)
+
+    # 2) Memory limit
+    if mem_limit is not None and resource:
+        try:
+            bytes_limit = _parse_mem_limit(mem_limit)
+            resource.setrlimit(resource.RLIMIT_AS, (bytes_limit, bytes_limit))
+        except Exception:
+            # Some platforms (e.g. macOS) may ignore or reject RLIMIT_AS
+            pass
+
+    # 3) CPU‐seconds limit
+    if cpu_time and resource:
+        resource.setrlimit(resource.RLIMIT_CPU, (cpu_time, cpu_time))
+
+    # 4) CPU‐core affinity (Linux only)
+    if cpu_cores and hasattr(os, "sched_setaffinity"):
+        os.sched_setaffinity(0, set(cpu_cores))
+
+
+def setup_local_process(mem_limit=None, cpu_limit=None, cpu_time=None, nice=0):
+    """
+    Set up the subprocess pre-exec function to set resource limits, ...etc.
+
+    Arguments:
+        mem_limit – int/float bytes or k8s-style string ("8Gi", "512Mi", etc.)
+        cpu_limit – k8s-style CPU string or number ("750m", "2", etc.)
+ *fractional CPUs are parsed but ignored; only whole cores used* + cpu_time – int seconds; maximum CPU-seconds before SIGXCPU + nice – int; niceness increment + + Returns: + Callable preexec_fn function that gets executed after the fork() and before exec() + """ + preexec = None + # --- Parse memory --- + mem_bytes = None + if mem_limit is not None: + mem_bytes = _parse_mem_limit(mem_limit) + + # --- Parse CPU and select cores --- + cpu_cores = None + if cpu_limit is not None: + cpu_units = _parse_cpu_limit(cpu_limit) + total_cores = os.cpu_count() or 1 + # Only the integer part is used for affinity + n_cores = min(int(cpu_units), total_cores) + cpu_cores = list(range(n_cores)) if n_cores > 0 else [] + + # --- Build preexec_fn for POSIX --- + preexec = None + if mem_bytes is not None or cpu_cores is not None or nice: + preexec = lambda: _limit_resources_posix( + mem_limit=mem_bytes, + cpu_time=cpu_time, + cpu_cores=cpu_cores, + nice=nice, + ) + + return preexec From bed8f7083a459ce933511064e4a66bb6534211a0 Mon Sep 17 00:00:00 2001 From: Saad Zaher Date: Wed, 9 Jul 2025 19:16:51 +0100 Subject: [PATCH 04/33] Update python/kubeflow/trainer/api/trainer_client.py Co-authored-by: Andrey Velichkevich Signed-off-by: Saad Zaher --- python/kubeflow/trainer/api/trainer_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/kubeflow/trainer/api/trainer_client.py b/python/kubeflow/trainer/api/trainer_client.py index 2d9aa05df..c52515076 100644 --- a/python/kubeflow/trainer/api/trainer_client.py +++ b/python/kubeflow/trainer/api/trainer_client.py @@ -80,7 +80,7 @@ def list_jobs(self, runtime: Optional[types.Runtime] = None): """List of all TrainJobs. Returns: - List[TrainerV1alpha1TrainJob]: List of created TrainJobs. + List[TrainJob]: List of created TrainJobs. If no TrainJob exist, an empty list is returned. 
Raises: From 28db17f8e2bd22b9ab50c7dd1cc30bf56ffd2cc2 Mon Sep 17 00:00:00 2001 From: Saad Zaher Date: Tue, 12 Aug 2025 12:17:42 +0300 Subject: [PATCH 05/33] Fix linting issues Signed-off-by: Saad Zaher --- kubeflow/trainer/api/trainer_client.py | 14 +++++--------- kubeflow/trainer/backends/__init__.py | 2 +- kubeflow/trainer/backends/local_process.py | 2 +- kubeflow/trainer/local/job.py | 8 ++++++-- kubeflow/trainer/local/resource_manager.py | 4 +++- kubeflow/trainer/local/runtimes.py | 2 +- kubeflow/trainer/types/local.py | 5 +++-- kubeflow/trainer/utils/local.py | 1 - 8 files changed, 20 insertions(+), 18 deletions(-) diff --git a/kubeflow/trainer/api/trainer_client.py b/kubeflow/trainer/api/trainer_client.py index fe559bb49..f639f4843 100644 --- a/kubeflow/trainer/api/trainer_client.py +++ b/kubeflow/trainer/api/trainer_client.py @@ -14,9 +14,6 @@ import logging import os -import random -import string -import uuid from typing import Optional, Union from kubeflow.trainer.constants import constants @@ -30,7 +27,10 @@ class TrainerClient: - def __init__(self, backend_type: Optional[str] = "kubernetes", backend_config: Optional[BackendConfig] = None): + def __init__( + self, backend_type: Optional[str] = "kubernetes", + backend_config: Optional[BackendConfig] = None + ): """ Initialize a trainer client. @@ -138,8 +138,4 @@ def train(self, TimeoutError: Timeout to create TrainJobs. RuntimeError: Failed to create TrainJobs. """ - # Generate unique name for the TrainJob. - # TODO (andreyvelich): Discuss this TrainJob name generation. 
- train_job_name = random.choice(string.ascii_lowercase) + uuid.uuid4().hex[:11] - - return self.__backend.train(runtime=runtime, initializer=initializer, trainer=trainer) \ No newline at end of file + return self.__backend.train(runtime=runtime, initializer=initializer, trainer=trainer) diff --git a/kubeflow/trainer/backends/__init__.py b/kubeflow/trainer/backends/__init__.py index cbc083f4c..136dacc13 100644 --- a/kubeflow/trainer/backends/__init__.py +++ b/kubeflow/trainer/backends/__init__.py @@ -25,4 +25,4 @@ "backend_cls": LocalProcessBackend, "config_cls": LocalProcessBackendConfig, } -} \ No newline at end of file +} diff --git a/kubeflow/trainer/backends/local_process.py b/kubeflow/trainer/backends/local_process.py index eb48605ce..842714b48 100644 --- a/kubeflow/trainer/backends/local_process.py +++ b/kubeflow/trainer/backends/local_process.py @@ -22,7 +22,7 @@ from kubeflow.trainer.constants import constants from kubeflow.trainer.types import types, local as local_types from kubeflow.trainer.types.backends import LocalProcessBackendConfig -from kubeflow.trainer.utils import utils, local as local_utils +from kubeflow.trainer.utils import local as local_utils from kubeflow.trainer.backends import base from kubeflow.trainer.local import runtimes as local_runtimes from kubeflow.trainer.local import job diff --git a/kubeflow/trainer/local/job.py b/kubeflow/trainer/local/job.py index 6803da944..8b730ca29 100644 --- a/kubeflow/trainer/local/job.py +++ b/kubeflow/trainer/local/job.py @@ -24,7 +24,10 @@ class LocalJob(threading.Thread): - def __init__(self, name, command: Union[List, str], mem_limit=None, cpu_time=None, cpu_limit=None, nice=0, dependencies=None): + def __init__( + self, name, command: Union[List, str], mem_limit=None, + cpu_time=None, cpu_limit=None, nice=0, dependencies=None + ): """Create a LocalJob. Create a local subprocess with threading to allow users to create background jobs. :param name: The name of the job. 
@@ -100,7 +103,8 @@ def run(self): self._returncode = self._process.wait() self._end_time = datetime.now() self._success = (self._process.returncode == 0) - msg = f"[{self.name}] Completed with code {self._returncode} in {self._end_time - self._start_time} seconds." + msg = (f"[{self.name}] Completed with code {self._returncode}" + f" in {self._end_time - self._start_time} seconds.") self._stdout += msg except Exception as e: diff --git a/kubeflow/trainer/local/resource_manager.py b/kubeflow/trainer/local/resource_manager.py index 816d18c5a..f2271994f 100644 --- a/kubeflow/trainer/local/resource_manager.py +++ b/kubeflow/trainer/local/resource_manager.py @@ -152,11 +152,13 @@ def setup_local_process(mem_limit=None, cpu_limit=None, cpu_time=None, nice=0): # --- Build preexec_fn for POSIX --- preexec = None if mem_bytes is not None or cpu_cores is not None or nice: - preexec = lambda: _limit_resources_posix( + def preexec_fn(): + return _limit_resources_posix( mem_limit=mem_bytes, cpu_time=cpu_time, cpu_cores=cpu_cores, nice=nice, ) + preexec = preexec_fn return preexec diff --git a/kubeflow/trainer/local/runtimes.py b/kubeflow/trainer/local/runtimes.py index f88234cb3..b0dc24afb 100644 --- a/kubeflow/trainer/local/runtimes.py +++ b/kubeflow/trainer/local/runtimes.py @@ -28,4 +28,4 @@ command=["torchrun"], create_venv=True, ) -] \ No newline at end of file +] diff --git a/kubeflow/trainer/types/local.py b/kubeflow/trainer/types/local.py index d0aaf91c4..0f0a2b6eb 100644 --- a/kubeflow/trainer/types/local.py +++ b/kubeflow/trainer/types/local.py @@ -13,7 +13,7 @@ # limitations under the License. 
import os -from typing import List, Optional, Union +from typing import List, Optional from dataclasses import dataclass, field from pathlib import Path import sys @@ -44,7 +44,8 @@ def get_executable_command(self) -> str: # @szaher need to make sure venv is created before this check # if not command_exe.exists(): - # raise FileNotFoundError(f"Python executable not found in virtualenv at: {command_exe}") + # raise FileNotFoundError( + # f"Python executable not found in virtualenv at: {command_exe}") return str(command_exe) diff --git a/kubeflow/trainer/utils/local.py b/kubeflow/trainer/utils/local.py index 421f6a836..dee4458c6 100644 --- a/kubeflow/trainer/utils/local.py +++ b/kubeflow/trainer/utils/local.py @@ -129,7 +129,6 @@ def build_local_training_executable( func_code = f"{func_code}\n{train_func.__name__}({train_func_parameters})\n" command = runtime.get_executable_command() - python_entrypoint = runtime.python_path exec_script = textwrap.dedent( """ From 7977cc42746019f407e9680f3e211dbc43c11632 Mon Sep 17 00:00:00 2001 From: Saad Zaher Date: Tue, 12 Aug 2025 14:13:58 +0300 Subject: [PATCH 06/33] fix unit tests Signed-off-by: Saad Zaher --- kubeflow/trainer/api/trainer_client_test.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kubeflow/trainer/api/trainer_client_test.py b/kubeflow/trainer/api/trainer_client_test.py index 7ea986c10..3ea964707 100644 --- a/kubeflow/trainer/api/trainer_client_test.py +++ b/kubeflow/trainer/api/trainer_client_test.py @@ -28,9 +28,10 @@ from unittest.mock import Mock, patch import pytest -from kubeflow.trainer.backends.k8s import K8SBackend +from kubeflow.trainer.backends import K8SBackend from kubeflow.trainer.constants import constants from kubeflow.trainer.types import types +from kubeflow.trainer.types.backends import K8SBackendConfig from kubeflow.trainer.utils import utils from kubeflow_trainer_api import models @@ -100,7 +101,7 @@ def trainer_client(request): 
read_namespaced_pod_log=Mock(side_effect=mock_read_namespaced_pod_log), ), ): - yield K8SBackend() + yield K8SBackend(K8SBackendConfig()) # -------------------------- From da0ce2fb90d22bc8ef98712d2ca8421a029d6172 Mon Sep 17 00:00:00 2001 From: Saad Zaher Date: Tue, 12 Aug 2025 15:59:20 +0300 Subject: [PATCH 07/33] add support wait_for_job_status Signed-off-by: Saad Zaher --- kubeflow/trainer/api/trainer_client.py | 31 +++++++++++++++++++++- kubeflow/trainer/backends/base.py | 29 +++++++++++++++++++- kubeflow/trainer/backends/local_process.py | 24 ++++++++++++++++- 3 files changed, 81 insertions(+), 3 deletions(-) diff --git a/kubeflow/trainer/api/trainer_client.py b/kubeflow/trainer/api/trainer_client.py index f639f4843..b829b130d 100644 --- a/kubeflow/trainer/api/trainer_client.py +++ b/kubeflow/trainer/api/trainer_client.py @@ -14,7 +14,7 @@ import logging import os -from typing import Optional, Union +from typing import Optional, Union, Set from kubeflow.trainer.constants import constants from kubeflow.trainer.types import types @@ -139,3 +139,32 @@ def train(self, RuntimeError: Failed to create TrainJobs. """ return self.__backend.train(runtime=runtime, initializer=initializer, trainer=trainer) + + def wait_for_job_status( + self, + name: str, + status: Set[str] = {constants.TRAINJOB_COMPLETE}, + timeout: int = 600, + polling_interval: int = 2, + ) -> types.TrainJob: + """Wait for TrainJob to reach the desired status + + Args: + name: Name of the TrainJob. + status: Set of expected statuses. It must be subset of Created, Running, Complete, and + Failed statuses. + timeout: How many seconds to wait until TrainJob reaches one of the expected conditions. + polling_interval: The polling interval in seconds to check TrainJob status. + + Returns: + TrainJob: The training job that reaches the desired status. + + Raises: + ValueError: The input values are incorrect. + RuntimeError: Failed to get TrainJob or TrainJob reaches unexpected Failed status. 
+ TimeoutError: Timeout to wait for TrainJob status. + """ + return self.__backend.wait_for_job_status( + name=name, status=status, timeout=timeout, + polling_interval=polling_interval, + ) diff --git a/kubeflow/trainer/backends/base.py b/kubeflow/trainer/backends/base.py index 64fd2e72e..4988e5f3b 100644 --- a/kubeflow/trainer/backends/base.py +++ b/kubeflow/trainer/backends/base.py @@ -14,7 +14,7 @@ import abc -from typing import Dict, List, Optional +from typing import Dict, List, Optional, Set from kubeflow.trainer.constants import constants from kubeflow.trainer.types import types @@ -59,3 +59,30 @@ def get_job_logs(self, @abc.abstractmethod def delete_job(self, name: str) -> None: raise NotImplementedError() + + @abc.abstractmethod + def wait_for_job_status( + self, + name: str, + status: Set[str] = {constants.TRAINJOB_COMPLETE}, + timeout: int = 600, + polling_interval: int = 2, + ) -> types.TrainJob: + """Wait for TrainJob to reach the desired status + + Args: + name: Name of the TrainJob. + status: Set of expected statuses. It must be subset of Created, Running, Complete, and + Failed statuses. + timeout: How many seconds to wait until TrainJob reaches one of the expected conditions. + polling_interval: The polling interval in seconds to check TrainJob status. + + Returns: + TrainJob: The training job that reaches the desired status. + + Raises: + ValueError: The input values are incorrect. + RuntimeError: Failed to get TrainJob or TrainJob reaches unexpected Failed status. + TimeoutError: Timeout to wait for TrainJob status. 
+ """ + raise NotImplementedError() diff --git a/kubeflow/trainer/backends/local_process.py b/kubeflow/trainer/backends/local_process.py index 842714b48..dd94a6dc2 100644 --- a/kubeflow/trainer/backends/local_process.py +++ b/kubeflow/trainer/backends/local_process.py @@ -17,7 +17,7 @@ import uuid import venv import random -from typing import List, Optional +from typing import List, Optional, Set from kubeflow.trainer.constants import constants from kubeflow.trainer.types import types, local as local_types @@ -220,6 +220,28 @@ def delete_job(self, name: str) -> None: # remove the job from the list of jobs self.__jobs.remove(target[0]) + def wait_for_job_status( + self, + name: str, + status: Set[str] = {constants.TRAINJOB_COMPLETE}, + timeout: int = 600, + polling_interval: int = 2, + ) -> types.TrainJob: + """Wait for TrainJob to reach the completed status + + Args: + name: Name of the TrainJob. + status: Set of expected statuses. It must be subset of Created, Running, Complete, and + Failed statuses. + timeout: How many seconds to wait until TrainJob reaches one of the expected conditions. + polling_interval: The polling interval in seconds to check TrainJob status. + + Returns: + TrainJob: The training job that reaches the desired status. 
+ """ + local_job = self.get_job(name=name) + local_job.job.join(timeout=timeout) + return local_job From ca564d6eb9fc66cb5f30aca5e3ca981bbd9e50fe Mon Sep 17 00:00:00 2001 From: Saad Zaher Date: Tue, 19 Aug 2025 14:00:32 +0100 Subject: [PATCH 08/33] Update data types Signed-off-by: Saad Zaher --- kubeflow/trainer/__init__.py | 5 + kubeflow/trainer/api/trainer_client.py | 74 +++++++++------ kubeflow/trainer/backends/__init__.py | 14 ++- kubeflow/trainer/backends/base.py | 38 +++----- kubeflow/trainer/backends/k8s.py | 24 ----- kubeflow/trainer/backends/local_process.py | 104 ++++----------------- kubeflow/trainer/local/job.py | 12 +-- kubeflow/trainer/types/local.py | 55 ----------- kubeflow/trainer/types/types.py | 49 +++++++++- 9 files changed, 132 insertions(+), 243 deletions(-) delete mode 100644 kubeflow/trainer/types/local.py diff --git a/kubeflow/trainer/__init__.py b/kubeflow/trainer/__init__.py index dce03bbb7..20d3d016a 100644 --- a/kubeflow/trainer/__init__.py +++ b/kubeflow/trainer/__init__.py @@ -18,6 +18,9 @@ # Import the Kubeflow Trainer client. from kubeflow.trainer.api.trainer_client import TrainerClient # noqa: F401 +# Import Kubeflow Trainer Backends +from kubeflow.trainer.types.backends import K8SBackendConfig, LocalProcessBackendConfig + # Import the Kubeflow Trainer constants. from kubeflow.trainer.constants.constants import DATASET_PATH, MODEL_PATH # noqa: F401 @@ -55,4 +58,6 @@ "RuntimeTrainer", "TrainerClient", "TrainerType", + "K8SBackendConfig", + "LocalProcessBackendConfig" ] diff --git a/kubeflow/trainer/api/trainer_client.py b/kubeflow/trainer/api/trainer_client.py index b829b130d..c6003ea72 100644 --- a/kubeflow/trainer/api/trainer_client.py +++ b/kubeflow/trainer/api/trainer_client.py @@ -13,54 +13,44 @@ # limitations under the License. 
import logging -import os -from typing import Optional, Union, Set +from typing import Dict, Optional, Union, Set, List, TypeAlias from kubeflow.trainer.constants import constants from kubeflow.trainer.types import types -from kubeflow.trainer.types.backends import BackendConfig +from kubeflow.trainer.types.backends import K8SBackendConfig, LocalProcessBackendConfig from kubeflow.trainer.backends import TRAINER_BACKEND_REGISTRY logger = logging.getLogger(__name__) +BackendCfg: TypeAlias = K8SBackendConfig | LocalProcessBackendConfig + class TrainerClient: - def __init__( - self, backend_type: Optional[str] = "kubernetes", - backend_config: Optional[BackendConfig] = None + def __init__( self, + backend_config: Optional[BackendCfg] = K8SBackendConfig() ): """ Initialize a trainer client. Args: - backend_type: name of the backend to be used. default is kubernetes. - backend_config: backend configuration. default is None. + backend_config: Backend configuration. Either K8SBackendConfig or + LocalProcessBackendConfig, or None to use the backend's + default config class. Defaults to K8SBackendConfig. 
""" - # if KF_TRAINING_BACKEND environment variable is set, use it to initialize training backend - backend_type = os.environ.get(constants.KF_TRAINING_BACKEND_NAME, backend_type) # initialize training backend - self.__backend = self.__init_backend(backend_type, backend_config) + self.__backend = self.__init_backend(backend_config) - def __init_backend(self, backend_type: str, backend_config: BackendConfig): - backend = TRAINER_BACKEND_REGISTRY.get(backend_type.lower()) + def __init_backend(self, backend_config: BackendCfg): + backend = TRAINER_BACKEND_REGISTRY.get(backend_config.__class__) if not backend: - raise ValueError("Unknown backend type '{}'".format(backend_type)) - # load the backend class - backend_cls = backend.get("backend_cls") - # check if backend configuration is present - if not backend_config: - backend_config = backend.get("config_cls")() - # check if provided backend config instance uses the correct config class - if not isinstance(backend_config, backend.get("config_cls")): - raise ValueError(f"Wrong Backend Configuration provided. " - f"{backend_type} requires config instance " - f"of type {backend.get('config_cls')}") + raise ValueError("Invalid backend config '{}'".format(backend_config)) + # initialize the backend class with the user provided config - return backend_cls(cfg=backend_config) + return backend(cfg=backend_config) - def list_runtimes(self): + def list_runtimes(self) -> types.RuntimeList: """List of the available Runtimes. Returns: @@ -73,10 +63,16 @@ def list_runtimes(self): """ return self.__backend.list_runtimes() - def get_runtime(self, name: str): + def get_runtime(self, name: str) -> types.TrainingRuntime: + """Get the Runtime object + Args: + name: Name of the runtime. + Returns: + types.TrainingRuntime: Runtime object. 
+ """ return self.__backend.get_runtime(name=name) - def list_jobs(self, runtime: Optional[types.Runtime] = None): + def list_jobs(self, runtime: Optional[types.Runtime] = None) -> List[types.TrainJobLike]: """List of all TrainJobs. Returns: @@ -89,7 +85,7 @@ def list_jobs(self, runtime: Optional[types.Runtime] = None): """ return self.__backend.list_jobs(runtime=runtime) - def get_job(self, name: str): + def get_job(self, name: str) -> types.TrainJob: """Get the TrainJob object""" return self.__backend.get_job(name=name) @@ -110,7 +106,7 @@ def get_job_logs(self, follow: Optional[bool] = False, step: str = constants.NODE, node_rank: int = 0, - ): + )-> Dict[str, str]: """Get the logs from TrainJob""" return self.__backend.get_job_logs(name=name, follow=follow, step=step, node_rank=node_rank) @@ -146,7 +142,7 @@ def wait_for_job_status( status: Set[str] = {constants.TRAINJOB_COMPLETE}, timeout: int = 600, polling_interval: int = 2, - ) -> types.TrainJob: + ) -> types.TrainJobLike: """Wait for TrainJob to reach the desired status Args: @@ -168,3 +164,19 @@ def wait_for_job_status( name=name, status=status, timeout=timeout, polling_interval=polling_interval, ) + + def get_runtime_packages(self, runtime: types.TrainingRuntime): + """ + Print the installed Python packages for the given Runtime. If Runtime has GPUs it also + prints available GPUs on the single training node. + + Args: + runtime: Reference to one of existing Runtimes. + + Raises: + ValueError: Input arguments are invalid. + RuntimeError: Failed to get Runtime. 
+ + """ + return self.__backend.get_runtime_packages(runtime=runtime) + diff --git a/kubeflow/trainer/backends/__init__.py b/kubeflow/trainer/backends/__init__.py index 136dacc13..064fb3855 100644 --- a/kubeflow/trainer/backends/__init__.py +++ b/kubeflow/trainer/backends/__init__.py @@ -16,13 +16,11 @@ from kubeflow.trainer.backends.local_process import LocalProcessBackend from kubeflow.trainer.types.backends import K8SBackendConfig, LocalProcessBackendConfig + TRAINER_BACKEND_REGISTRY = { - "kubernetes": { - "backend_cls": K8SBackend, - "config_cls": K8SBackendConfig, - }, - "local": { - "backend_cls": LocalProcessBackend, - "config_cls": LocalProcessBackendConfig, - } + cfg_class:backend_class for cfg_class, backend_class in + zip( + [K8SBackendConfig, LocalProcessBackendConfig], + [K8SBackend, LocalProcessBackend] + ) } diff --git a/kubeflow/trainer/backends/base.py b/kubeflow/trainer/backends/base.py index 4988e5f3b..acd99b068 100644 --- a/kubeflow/trainer/backends/base.py +++ b/kubeflow/trainer/backends/base.py @@ -22,11 +22,11 @@ class TrainingBackend(abc.ABC): @abc.abstractmethod - def list_runtimes(self) -> List[types.Runtime]: + def list_runtimes(self) -> types.RuntimeList: raise NotImplementedError() @abc.abstractmethod - def get_runtime(self, name: str) -> Optional[types.Runtime]: + def get_runtime(self, name: str) -> Optional[types.TrainingRuntime]: raise NotImplementedError() @abc.abstractmethod @@ -40,11 +40,11 @@ def train(self, @abc.abstractmethod def list_jobs( self, runtime: Optional[types.Runtime] = None - ) -> List[types.TrainJob]: + ) -> List[types.TrainJobLike]: raise NotImplementedError() @abc.abstractmethod - def get_job(self, name: str) -> Optional[types.TrainJob]: + def get_job(self, name: str) -> Optional[types.TrainJobLike]: raise NotImplementedError() @abc.abstractmethod @@ -62,27 +62,13 @@ def delete_job(self, name: str) -> None: @abc.abstractmethod def wait_for_job_status( - self, - name: str, - status: Set[str] = 
{constants.TRAINJOB_COMPLETE}, - timeout: int = 600, - polling_interval: int = 2, - ) -> types.TrainJob: - """Wait for TrainJob to reach the desired status - - Args: - name: Name of the TrainJob. - status: Set of expected statuses. It must be subset of Created, Running, Complete, and - Failed statuses. - timeout: How many seconds to wait until TrainJob reaches one of the expected conditions. - polling_interval: The polling interval in seconds to check TrainJob status. - - Returns: - TrainJob: The training job that reaches the desired status. + self, + name: str, + status: Set[str] = {constants.TRAINJOB_COMPLETE}, + timeout: int = 600, + polling_interval: int = 2, + ) -> types.TrainJobLike: + raise NotImplementedError() - Raises: - ValueError: The input values are incorrect. - RuntimeError: Failed to get TrainJob or TrainJob reaches unexpected Failed status. - TimeoutError: Timeout to wait for TrainJob status. - """ + def get_runtime_packages(self, runtime: types.TrainingRuntime): raise NotImplementedError() diff --git a/kubeflow/trainer/backends/k8s.py b/kubeflow/trainer/backends/k8s.py index d3c22d38f..23a174d41 100644 --- a/kubeflow/trainer/backends/k8s.py +++ b/kubeflow/trainer/backends/k8s.py @@ -59,16 +59,6 @@ def __init__(self, self.namespace = cfg.namespace def list_runtimes(self) -> List[types.Runtime]: - """List of the available Runtimes. - - Returns: - List[Runtime]: List of available training runtimes. - If no runtimes exist, an empty list is returned. - - Raises: - TimeoutError: Timeout to list Runtimes. - RuntimeError: Failed to list Runtimes. 
- """ result = [] try: @@ -113,8 +103,6 @@ def list_runtimes(self) -> List[types.Runtime]: return result def get_runtime(self, name: str) -> types.Runtime: - """Get the the Runtime object""" - try: thread = self.custom_api.get_cluster_custom_object( constants.GROUP, @@ -142,18 +130,6 @@ def get_runtime(self, name: str) -> types.Runtime: return self.__get_runtime_from_crd(runtime) # type: ignore def get_runtime_packages(self, runtime: types.Runtime): - """ - Print the installed Python packages for the given Runtime. If Runtime has GPUs it also - prints available GPUs on the single training node. - - Args: - runtime: Reference to one of existing Runtimes. - - Raises: - ValueError: Input arguments are invalid. - RuntimeError: Failed to get Runtime. - - """ if runtime.trainer.trainer_type == types.TrainerType.BUILTIN_TRAINER: raise ValueError("Cannot get Runtime packages for BuiltinTrainer") diff --git a/kubeflow/trainer/backends/local_process.py b/kubeflow/trainer/backends/local_process.py index dd94a6dc2..eddd50856 100644 --- a/kubeflow/trainer/backends/local_process.py +++ b/kubeflow/trainer/backends/local_process.py @@ -17,10 +17,10 @@ import uuid import venv import random -from typing import List, Optional, Set +from typing import List, Optional, Set, Dict from kubeflow.trainer.constants import constants -from kubeflow.trainer.types import types, local as local_types +from kubeflow.trainer.types import types from kubeflow.trainer.types.backends import LocalProcessBackendConfig from kubeflow.trainer.utils import local as local_utils from kubeflow.trainer.backends import base @@ -41,23 +41,12 @@ def __init__(self, self.cfg = cfg - def list_runtimes(self) -> List[types.Runtime]: - """List of the available Runtimes. - - Returns: - List[Runtime]: List of available local training runtimes. - If no runtimes exist, an empty list is returned. 
- """ + def list_runtimes(self) -> List[types.LocalRuntime]: return local_runtimes.runtimes - def get_runtime(self, name: str) -> Optional[local_types.LocalRuntime]: - """Get the the Runtime object - Returns: - LocalRuntime: Runtime object for the given name. - Raises: - ValueError: If no Runtime is found for the given name. - """ + def get_runtime(self, name: str) -> Optional[types.LocalRuntime]: + _runtime = [rt for rt in local_runtimes.runtimes if rt.name == name] if not _runtime: raise ValueError(f"Runtime '{name}' not found.") @@ -66,28 +55,10 @@ def get_runtime(self, name: str) -> Optional[local_types.LocalRuntime]: def train(self, - runtime: local_types.LocalRuntime, + runtime: types.LocalRuntime, initializer: Optional[types.Initializer] = None, trainer: Optional[types.RuntimeTrainer] = None) -> str: - """ - Create the LocalTrainJob. You can configure these types of training task: - - Custom Training Task: Training with a self-contained function that encapsulates - the entire model training process, e.g. `CustomTrainer`. - - Args: - runtime (`types.Runtime`): Reference to one of existing Runtimes. - initializer (`Optional[types.Initializer]`): - Configuration for the dataset and model initializers. - trainer (`Optional[types.CustomTrainer]`): - Configuration for Custom Training Task. - - Returns: - str: The unique name of the TrainJob that has been generated. - - Raises: - ValueError: Input arguments are invalid. - """ train_job_name = random.choice(string.ascii_lowercase) + uuid.uuid4().hex[:11] # Build the env if not trainer: @@ -147,16 +118,10 @@ def __create_venv(self, env_dir: str) -> None: # @szaher do we need to replace this with another LocalJob for Env preparation? venv.create(env_dir=env_dir, with_pip=True) - def list_jobs(self, runtime: Optional[types.Runtime] = None) -> List[local_types.LocalTrainJob]: - """List of all TrainJobs. - - Returns: - List[LocalTrainJob]: List of created LocalTrainJobs. - If no TrainJob exist, an empty list is returned. 
- """ + def list_jobs(self, runtime: Optional[types.Runtime] = None) -> List[types.LocalTrainJob]: result = [ - local_types.LocalTrainJob( + types.LocalTrainJob( name=j.name, creation_timestamp=j.creation_time, runtime=runtime, steps=[], job=j, num_nodes=len(self.__jobs), ) @@ -165,17 +130,11 @@ def list_jobs(self, runtime: Optional[types.Runtime] = None) -> List[local_types return result - def get_job(self, name: str) -> Optional[local_types.LocalTrainJob]: - """Get the TrainJob object. - Returns: - LocalTrainJob: LocalTrainJob object. - Raises: - ValueError: if TrainJob does not exist. - """ + def get_job(self, name: str) -> Optional[types.LocalTrainJob]: j = [j for j in self.__jobs if j.name == name] if not j: raise ValueError("No TrainJob with name '%s'" % name) - return local_types.LocalTrainJob( + return types.LocalTrainJob( name=j[0].name, creation_timestamp=j[0].completion_time, runtime=None, steps=[], job=j[0], num_nodes=len(self.__jobs), @@ -185,31 +144,13 @@ def get_job_logs(self, name: str, follow: Optional[bool] = False, step: str = constants.NODE, - node_rank: int = 0) -> List[str]: - """Get the logs from TrainJob - Args: - name (`str`) : The name of the TrainJob. - follow (`Optional[bool]`): Follow the log stream or not (default: False). - step (`str`): Step number (default: 0) [NOT IMPLEMENTED]. - node_rank (`int`): Node rank (default: 0) [NOT IMPLEMENTED]. - Returns: - List[str]: List of logs from TrainJob. - Raises: - ValueError: if TrainJob does not exist. - """ + node_rank: int = 0) -> Dict[str, str]: j = self.get_job(name=name) - return j.job.logs(follow=follow) - - def delete_job(self, name: str) -> None: - """Delete the TrainJob. - - Args: - name: Name of the TrainJob. - - Raises: - ValueError: if TrainJob does not exist. 
- """ + return { + j.name: j.job.logs(follow=follow), + } + def delete_job(self, name: str): # delete job from registry or job list target = [j for j in self.__jobs if j.name == name] if not target: @@ -226,19 +167,8 @@ def wait_for_job_status( status: Set[str] = {constants.TRAINJOB_COMPLETE}, timeout: int = 600, polling_interval: int = 2, - ) -> types.TrainJob: - """Wait for TrainJob to reach the completed status - - Args: - name: Name of the TrainJob. - status: Set of expected statuses. It must be subset of Created, Running, Complete, and - Failed statuses. - timeout: How many seconds to wait until TrainJob reaches one of the expected conditions. - polling_interval: The polling interval in seconds to check TrainJob status. - - Returns: - TrainJob: The training job that reaches the desired status. - """ + ) -> types.LocalTrainJob: + local_job = self.get_job(name=name) local_job.job.join(timeout=timeout) return local_job diff --git a/kubeflow/trainer/local/job.py b/kubeflow/trainer/local/job.py index 8b730ca29..0f9e1ffb7 100644 --- a/kubeflow/trainer/local/job.py +++ b/kubeflow/trainer/local/job.py @@ -26,7 +26,7 @@ class LocalJob(threading.Thread): def __init__( self, name, command: Union[List, str], mem_limit=None, - cpu_time=None, cpu_limit=None, nice=0, dependencies=None + cpu_time=None, cpu_limit=None, nice=0 ): """Create a LocalJob. Create a local subprocess with threading to allow users to create background jobs. @@ -34,13 +34,10 @@ def __init__( :type name: str :param command: The command to run. :type command: str - :param dependencies: The dependencies to run in the job. 
- :type dependencies: List[job.LocalJob] """ super().__init__() self.name = name self.command = command - self.dependencies = dependencies or [] self._stdout = "" self._returncode = None self._success = False @@ -57,13 +54,6 @@ def __init__( self.__nice = nice def run(self): - for dep in self.dependencies: - dep.join() - if not dep.success: - with self._lock: - self._stdout = f"Dependency {dep.name} failed. Skipping." - return - logger.debug(f"[{self.name}] Starting...") try: self._start_time = datetime.now() diff --git a/kubeflow/trainer/types/local.py b/kubeflow/trainer/types/local.py deleted file mode 100644 index 0f0a2b6eb..000000000 --- a/kubeflow/trainer/types/local.py +++ /dev/null @@ -1,55 +0,0 @@ -# Copyright 2025 The Kubeflow Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -from typing import List, Optional -from dataclasses import dataclass, field -from pathlib import Path -import sys - -from kubeflow.trainer.types import types -from kubeflow.trainer.local.job import LocalJob - - -@dataclass -class LocalRuntime(types.Runtime): - create_venv: Optional[bool] = True - command: List[str] = field(default_factory=list) - python_path: Optional[str] = sys.executable - execution_dir: Optional[str] = None - - def get_executable_command(self) -> str: - venv_path = Path(self.execution_dir) - command_str = " ".join(self.command).lstrip() - if self.create_venv: - if os.name == 'nt': - # Windows - command_exe = venv_path / "Scripts" / command_str - else: - # Unix / macOS - command_exe = venv_path / "bin" / command_str - else: - command_exe = command_str - - # @szaher need to make sure venv is created before this check - # if not command_exe.exists(): - # raise FileNotFoundError( - # f"Python executable not found in virtualenv at: {command_exe}") - - return str(command_exe) - - -@dataclass -class LocalTrainJob(types.TrainJob): - job: LocalJob = None diff --git a/kubeflow/trainer/types/types.py b/kubeflow/trainer/types/types.py index 778f1a1df..c1c867dda 100644 --- a/kubeflow/trainer/types/types.py +++ b/kubeflow/trainer/types/types.py @@ -13,12 +13,16 @@ # limitations under the License. +import os +from pathlib import Path +import sys from dataclasses import dataclass, field from datetime import datetime from enum import Enum -from typing import Callable, Dict, Optional +from typing import Callable, Dict, Optional, List, Union, TypeAlias from kubeflow.trainer.constants import constants +from kubeflow.trainer.local.job import LocalJob # Configuration for the Custom Trainer. 
@@ -238,3 +242,46 @@ class Initializer: dataset: Optional[HuggingFaceDatasetInitializer] = None model: Optional[HuggingFaceModelInitializer] = None + + +# local execution types + +@dataclass +class LocalRuntime(Runtime): + create_venv: Optional[bool] = True + command: List[str] = field(default_factory=list) + python_path: Optional[str] = sys.executable + execution_dir: Optional[str] = None + + def get_executable_command(self) -> str: + venv_path = Path(self.execution_dir) + command_str = " ".join(self.command).lstrip() + if self.create_venv: + if os.name == 'nt': + # Windows + command_exe = venv_path / "Scripts" / command_str + else: + # Unix / macOS + command_exe = venv_path / "bin" / command_str + else: + command_exe = command_str + + # @szaher need to make sure venv is created before this check + # if not command_exe.exists(): + # raise FileNotFoundError( + # f"Python executable not found in virtualenv at: {command_exe}") + + return str(command_exe) + + +@dataclass +class LocalTrainJob(TrainJob): + job: LocalJob = None + + +# Training Backends Types +# this can be simplified if we drop python3.9 support as follows +RuntimeList: TypeAlias = Union[List[Runtime], List[LocalRuntime]] +TrainingRuntime: TypeAlias = Union[Runtime, LocalRuntime] +TrainJobLike: TypeAlias = Union[TrainJob, LocalTrainJob] + From d9af6f28c103c4af9b2549d0efcbfaf0c737077a Mon Sep 17 00:00:00 2001 From: Saad Zaher Date: Tue, 19 Aug 2025 16:32:44 +0100 Subject: [PATCH 09/33] fix merge conflict Signed-off-by: Saad Zaher --- kubeflow/trainer/backends/k8s.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/kubeflow/trainer/backends/k8s.py b/kubeflow/trainer/backends/k8s.py index 23a174d41..d956fb874 100644 --- a/kubeflow/trainer/backends/k8s.py +++ b/kubeflow/trainer/backends/k8s.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. - +import copy import logging import multiprocessing import queue @@ -103,6 +103,7 @@ def list_runtimes(self) -> List[types.Runtime]: return result def get_runtime(self, name: str) -> types.Runtime: + try: thread = self.custom_api.get_cluster_custom_object( constants.GROUP, @@ -134,11 +135,14 @@ def get_runtime_packages(self, runtime: types.Runtime): if runtime.trainer.trainer_type == types.TrainerType.BUILTIN_TRAINER: raise ValueError("Cannot get Runtime packages for BuiltinTrainer") + # Create a deepcopy of the runtime to avoid modifying the original command. + runtime_copy = copy.deepcopy(runtime) + # Run mpirun only within the single process. - if runtime.trainer.command[0] == "mpirun": + if runtime_copy.trainer.command[0] == "mpirun": mpi_command = list(constants.MPI_COMMAND) mpi_command[1:3] = ["-np", "1"] - runtime.trainer.set_command(tuple(mpi_command)) + runtime_copy.trainer.set_command(tuple(mpi_command)) def print_packages(): import subprocess @@ -168,12 +172,12 @@ def print_packages(): # Create the TrainJob and wait until it completes. # If Runtime trainer has GPU resources use them, otherwise run TrainJob with 1 CPU. 
job_name = self.train( - runtime=runtime, + runtime=runtime_copy, trainer=types.CustomTrainer( func=print_packages, num_nodes=1, resources_per_node=( - {"cpu": 1} if runtime.trainer.device != "gpu" else None + {"cpu": 1} if runtime_copy.trainer.device != "gpu" else None ), ), ) @@ -183,10 +187,10 @@ def print_packages(): self.delete_job(job_name) def train( - self, - runtime: Optional[types.Runtime] = None, - initializer: Optional[types.Initializer] = None, - trainer: Optional[Union[types.CustomTrainer, types.BuiltinTrainer]] = None, + self, + runtime: Optional[types.Runtime] = None, + initializer: Optional[types.Initializer] = None, + trainer: Optional[Union[types.CustomTrainer, types.BuiltinTrainer]] = None, ) -> str: """ Create the TrainJob. You can configure these types of training task: @@ -724,4 +728,3 @@ def __get_trainjob_from_crd( trainjob.status = constants.TRAINJOB_RUNNING return trainjob - From 46961ba4aa541ab7d5698ee5eb894f2ddaefc4a6 Mon Sep 17 00:00:00 2001 From: Saad Zaher Date: Wed, 20 Aug 2025 11:56:26 +0100 Subject: [PATCH 10/33] fix unit tests Signed-off-by: Saad Zaher --- kubeflow/trainer/local/runtimes.py | 4 ++-- kubeflow/trainer/utils/local.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kubeflow/trainer/local/runtimes.py b/kubeflow/trainer/local/runtimes.py index b0dc24afb..cd969648e 100644 --- a/kubeflow/trainer/local/runtimes.py +++ b/kubeflow/trainer/local/runtimes.py @@ -13,11 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from kubeflow.trainer.types import local as local_types, types +from kubeflow.trainer.types import types runtimes = [ - local_types.LocalRuntime( + types.LocalRuntime( name="torch-distributed", trainer=types.RuntimeTrainer( trainer_type=types.TrainerType.CUSTOM_TRAINER, diff --git a/kubeflow/trainer/utils/local.py b/kubeflow/trainer/utils/local.py index dee4458c6..9104c97d4 100644 --- a/kubeflow/trainer/utils/local.py +++ b/kubeflow/trainer/utils/local.py @@ -20,7 +20,7 @@ import textwrap from typing import List, Callable, Optional, Dict, Any, Tuple from kubeflow.trainer.constants import constants -from kubeflow.trainer.types import local as local_types +from kubeflow.trainer.types import types logger = logging.getLogger(__name__) @@ -93,7 +93,7 @@ def get_script_for_local_python_packages_install( def build_local_training_executable( - runtime: local_types.LocalRuntime, + runtime: types.LocalRuntime, train_func: Callable, train_func_parameters: Optional[Dict[str, Any]], pip_index_url: str, From e226167eba74b16b1d54d496fbd543ec6c34539c Mon Sep 17 00:00:00 2001 From: Saad Zaher Date: Wed, 20 Aug 2025 17:15:05 +0100 Subject: [PATCH 11/33] remove TypeAlias Signed-off-by: Saad Zaher --- kubeflow/trainer/api/trainer_client.py | 4 ++-- kubeflow/trainer/types/types.py | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/kubeflow/trainer/api/trainer_client.py b/kubeflow/trainer/api/trainer_client.py index c6003ea72..7836c723c 100644 --- a/kubeflow/trainer/api/trainer_client.py +++ b/kubeflow/trainer/api/trainer_client.py @@ -13,7 +13,7 @@ # limitations under the License. 
import logging -from typing import Dict, Optional, Union, Set, List, TypeAlias +from typing import Dict, Optional, Union, Set, List from kubeflow.trainer.constants import constants from kubeflow.trainer.types import types @@ -23,7 +23,7 @@ logger = logging.getLogger(__name__) -BackendCfg: TypeAlias = K8SBackendConfig | LocalProcessBackendConfig +BackendCfg = Union[K8SBackendConfig, LocalProcessBackendConfig] class TrainerClient: diff --git a/kubeflow/trainer/types/types.py b/kubeflow/trainer/types/types.py index c1c867dda..3ed2e512b 100644 --- a/kubeflow/trainer/types/types.py +++ b/kubeflow/trainer/types/types.py @@ -19,7 +19,7 @@ from dataclasses import dataclass, field from datetime import datetime from enum import Enum -from typing import Callable, Dict, Optional, List, Union, TypeAlias +from typing import Callable, Dict, Optional, List, Union from kubeflow.trainer.constants import constants from kubeflow.trainer.local.job import LocalJob @@ -281,7 +281,7 @@ class LocalTrainJob(TrainJob): # Training Backends Types # this can be simplified if we drop python3.9 support as follows -RuntimeList: TypeAlias = Union[List[Runtime], List[LocalRuntime]] -TrainingRuntime: TypeAlias = Union[Runtime, LocalRuntime] -TrainJobLike: TypeAlias = Union[TrainJob, LocalTrainJob] +RuntimeList = Union[List[Runtime], List[LocalRuntime]] +TrainingRuntime = Union[Runtime, LocalRuntime] +TrainJobLike = Union[TrainJob, LocalTrainJob] From 2ef70dbaa78087ecaca1d2f1a4401e0ccb43bbf7 Mon Sep 17 00:00:00 2001 From: Saad Zaher Date: Wed, 20 Aug 2025 17:42:49 +0100 Subject: [PATCH 12/33] Replace TRAINER_BACKEND_REGISTRY with TRAINER_BACKEND Signed-off-by: Saad Zaher --- kubeflow/trainer/api/trainer_client.py | 4 ++-- kubeflow/trainer/backends/__init__.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/kubeflow/trainer/api/trainer_client.py b/kubeflow/trainer/api/trainer_client.py index 7836c723c..61436a73b 100644 --- a/kubeflow/trainer/api/trainer_client.py +++ 
b/kubeflow/trainer/api/trainer_client.py @@ -18,7 +18,7 @@ from kubeflow.trainer.constants import constants from kubeflow.trainer.types import types from kubeflow.trainer.types.backends import K8SBackendConfig, LocalProcessBackendConfig -from kubeflow.trainer.backends import TRAINER_BACKEND_REGISTRY +from kubeflow.trainer.backends import TRAINER_BACKENDS logger = logging.getLogger(__name__) @@ -43,7 +43,7 @@ def __init__( self, self.__backend = self.__init_backend(backend_config) def __init_backend(self, backend_config: BackendCfg): - backend = TRAINER_BACKEND_REGISTRY.get(backend_config.__class__) + backend = TRAINER_BACKENDS.get(backend_config.__class__) if not backend: raise ValueError("Invalid backend config '{}'".format(backend_config)) diff --git a/kubeflow/trainer/backends/__init__.py b/kubeflow/trainer/backends/__init__.py index 064fb3855..5d21e30b9 100644 --- a/kubeflow/trainer/backends/__init__.py +++ b/kubeflow/trainer/backends/__init__.py @@ -17,7 +17,7 @@ from kubeflow.trainer.types.backends import K8SBackendConfig, LocalProcessBackendConfig -TRAINER_BACKEND_REGISTRY = { +TRAINER_BACKENDS = { cfg_class:backend_class for cfg_class, backend_class in zip( [K8SBackendConfig, LocalProcessBackendConfig], From 822a2629924cf3af36bbe4d0c0e91be96788b6db Mon Sep 17 00:00:00 2001 From: Saad Zaher Date: Thu, 21 Aug 2025 23:13:58 +0100 Subject: [PATCH 13/33] Update kubeflow/trainer/api/trainer_client.py Co-authored-by: Andrey Velichkevich Signed-off-by: Saad Zaher --- kubeflow/trainer/api/trainer_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kubeflow/trainer/api/trainer_client.py b/kubeflow/trainer/api/trainer_client.py index 61436a73b..d291f8811 100644 --- a/kubeflow/trainer/api/trainer_client.py +++ b/kubeflow/trainer/api/trainer_client.py @@ -17,7 +17,7 @@ from kubeflow.trainer.constants import constants from kubeflow.trainer.types import types -from kubeflow.trainer.types.backends import K8SBackendConfig, LocalProcessBackendConfig 
+from kubeflow.trainer.types.backends import KubernetesBackendConfig, LocalProcessBackendConfig from kubeflow.trainer.backends import TRAINER_BACKENDS From f00280a436cb2ed02c16608bc4299c4c308892a4 Mon Sep 17 00:00:00 2001 From: Saad Zaher Date: Thu, 21 Aug 2025 23:14:18 +0100 Subject: [PATCH 14/33] Update kubeflow/trainer/api/trainer_client.py Co-authored-by: Andrey Velichkevich Signed-off-by: Saad Zaher --- kubeflow/trainer/api/trainer_client.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kubeflow/trainer/api/trainer_client.py b/kubeflow/trainer/api/trainer_client.py index d291f8811..6532cb1cc 100644 --- a/kubeflow/trainer/api/trainer_client.py +++ b/kubeflow/trainer/api/trainer_client.py @@ -31,8 +31,7 @@ class TrainerClient: def __init__( self, backend_config: Optional[BackendCfg] = K8SBackendConfig() ): - """ - Initialize a trainer client. + """Initialize a Kubeflow Trainer client. Args: backend_config: Backend configuration. Either K8SBackendConfig or From e0c714fc52693839a4c53533e924ce3fc6d72318 Mon Sep 17 00:00:00 2001 From: Saad Zaher Date: Fri, 22 Aug 2025 01:12:08 +0100 Subject: [PATCH 15/33] Restructure training backends into separate dirs Signed-off-by: Saad Zaher --- kubeflow/trainer/__init__.py | 5 ++-- kubeflow/trainer/api/trainer_client.py | 27 ++++++++++--------- kubeflow/trainer/api/trainer_client_test.py | 6 ++--- kubeflow/trainer/backends/__init__.py | 26 ------------------ .../kubernetes}/__init__.py | 0 .../trainer/backends/{ => kubernetes}/k8s.py | 6 ++--- .../kubernetes/types.py} | 13 +++------ kubeflow/trainer/backends/local/__init__.py | 0 kubeflow/trainer/{ => backends}/local/job.py | 2 +- .../backends/{ => local}/local_process.py | 12 ++++----- .../{ => backends}/local/resource_manager.py | 0 .../trainer/{ => backends}/local/runtimes.py | 0 kubeflow/trainer/backends/local/types.py | 18 +++++++++++++ .../local.py => backends/local/utils.py} | 0 kubeflow/trainer/types/types.py | 7 ++++- 15 files changed, 58 
insertions(+), 64 deletions(-) rename kubeflow/trainer/{local => backends/kubernetes}/__init__.py (100%) rename kubeflow/trainer/backends/{ => kubernetes}/k8s.py (99%) rename kubeflow/trainer/{types/backends.py => backends/kubernetes/types.py} (83%) create mode 100644 kubeflow/trainer/backends/local/__init__.py rename kubeflow/trainer/{ => backends}/local/job.py (98%) rename kubeflow/trainer/backends/{ => local}/local_process.py (93%) rename kubeflow/trainer/{ => backends}/local/resource_manager.py (100%) rename kubeflow/trainer/{ => backends}/local/runtimes.py (100%) create mode 100644 kubeflow/trainer/backends/local/types.py rename kubeflow/trainer/{utils/local.py => backends/local/utils.py} (100%) diff --git a/kubeflow/trainer/__init__.py b/kubeflow/trainer/__init__.py index 20d3d016a..39ba4434e 100644 --- a/kubeflow/trainer/__init__.py +++ b/kubeflow/trainer/__init__.py @@ -19,7 +19,8 @@ from kubeflow.trainer.api.trainer_client import TrainerClient # noqa: F401 # Import Kubeflow Trainer Backends -from kubeflow.trainer.types.backends import K8SBackendConfig, LocalProcessBackendConfig +from kubeflow.trainer.backends.kubernetes.types import KubernetesBackendConfig +from kubeflow.trainer.backends.local.types import LocalProcessBackendConfig # Import the Kubeflow Trainer constants. 
from kubeflow.trainer.constants.constants import DATASET_PATH, MODEL_PATH # noqa: F401 @@ -58,6 +59,6 @@ "RuntimeTrainer", "TrainerClient", "TrainerType", - "K8SBackendConfig", + "KubernetesBackendConfig", "LocalProcessBackendConfig" ] diff --git a/kubeflow/trainer/api/trainer_client.py b/kubeflow/trainer/api/trainer_client.py index 6532cb1cc..cd8f21cc0 100644 --- a/kubeflow/trainer/api/trainer_client.py +++ b/kubeflow/trainer/api/trainer_client.py @@ -15,39 +15,40 @@ import logging from typing import Dict, Optional, Union, Set, List +from kubeflow.trainer.backends.kubernetes.k8s import KubernetesBackend +from kubeflow.trainer.backends.local.local_process import LocalProcessBackend from kubeflow.trainer.constants import constants from kubeflow.trainer.types import types -from kubeflow.trainer.types.backends import KubernetesBackendConfig, LocalProcessBackendConfig -from kubeflow.trainer.backends import TRAINER_BACKENDS +from kubeflow.trainer.backends.kubernetes.types import KubernetesBackendConfig +from kubeflow.trainer.backends.local.types import LocalProcessBackendConfig logger = logging.getLogger(__name__) -BackendCfg = Union[K8SBackendConfig, LocalProcessBackendConfig] +BackendCfg = Union[KubernetesBackendConfig, LocalProcessBackendConfig] class TrainerClient: def __init__( self, - backend_config: Optional[BackendCfg] = K8SBackendConfig() + backend_config: Optional[BackendCfg] = KubernetesBackendConfig() ): """Initialize a Kubeflow Trainer client. Args: - backend_config: Backend configuration. Either K8SBackendConfig or + backend_config: Backend configuration. Either KubernetesBackendConfig or LocalProcessBackendConfig, or None to use the backend's - default config class. Defaults to K8SBackendConfig. + default config class. Defaults to KubernetesBackendConfig. 
""" # initialize training backend - self.__backend = self.__init_backend(backend_config) - - def __init_backend(self, backend_config: BackendCfg): - backend = TRAINER_BACKENDS.get(backend_config.__class__) - if not backend: + self.__backend = None + if isinstance(backend_config, KubernetesBackendConfig): + self.__backend = KubernetesBackend(backend_config) + elif isinstance(backend_config, LocalProcessBackendConfig): + self.__backend = LocalProcessBackend(backend_config) + else: raise ValueError("Invalid backend config '{}'".format(backend_config)) - # initialize the backend class with the user provided config - return backend(cfg=backend_config) def list_runtimes(self) -> types.RuntimeList: """List of the available Runtimes. diff --git a/kubeflow/trainer/api/trainer_client_test.py b/kubeflow/trainer/api/trainer_client_test.py index 3ea964707..dc0736c95 100644 --- a/kubeflow/trainer/api/trainer_client_test.py +++ b/kubeflow/trainer/api/trainer_client_test.py @@ -28,10 +28,10 @@ from unittest.mock import Mock, patch import pytest -from kubeflow.trainer.backends import K8SBackend +from kubeflow.trainer.backends.kubernetes.k8s import KubernetesBackend from kubeflow.trainer.constants import constants from kubeflow.trainer.types import types -from kubeflow.trainer.types.backends import K8SBackendConfig +from kubeflow.trainer.backends.kubernetes.types import KubernetesBackendConfig from kubeflow.trainer.utils import utils from kubeflow_trainer_api import models @@ -101,7 +101,7 @@ def trainer_client(request): read_namespaced_pod_log=Mock(side_effect=mock_read_namespaced_pod_log), ), ): - yield K8SBackend(K8SBackendConfig()) + yield KubernetesBackend(KubernetesBackendConfig()) # -------------------------- diff --git a/kubeflow/trainer/backends/__init__.py b/kubeflow/trainer/backends/__init__.py index 5d21e30b9..e69de29bb 100644 --- a/kubeflow/trainer/backends/__init__.py +++ b/kubeflow/trainer/backends/__init__.py @@ -1,26 +0,0 @@ -# Copyright 2025 The Kubeflow Authors. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from kubeflow.trainer.backends.k8s import K8SBackend -from kubeflow.trainer.backends.local_process import LocalProcessBackend -from kubeflow.trainer.types.backends import K8SBackendConfig, LocalProcessBackendConfig - - -TRAINER_BACKENDS = { - cfg_class:backend_class for cfg_class, backend_class in - zip( - [K8SBackendConfig, LocalProcessBackendConfig], - [K8SBackend, LocalProcessBackend] - ) -} diff --git a/kubeflow/trainer/local/__init__.py b/kubeflow/trainer/backends/kubernetes/__init__.py similarity index 100% rename from kubeflow/trainer/local/__init__.py rename to kubeflow/trainer/backends/kubernetes/__init__.py diff --git a/kubeflow/trainer/backends/k8s.py b/kubeflow/trainer/backends/kubernetes/k8s.py similarity index 99% rename from kubeflow/trainer/backends/k8s.py rename to kubeflow/trainer/backends/kubernetes/k8s.py index d956fb874..285417faa 100644 --- a/kubeflow/trainer/backends/k8s.py +++ b/kubeflow/trainer/backends/kubernetes/k8s.py @@ -28,7 +28,7 @@ from kubeflow.trainer.constants import constants from kubeflow.trainer.types import types -from kubeflow.trainer.types.backends import K8SBackendConfig +from kubeflow.trainer.backends.kubernetes.types import KubernetesBackendConfig from kubeflow.trainer.utils import utils from kubeflow.trainer.backends import base @@ -36,10 +36,10 @@ logger = logging.getLogger(__name__) -class K8SBackend(base.TrainingBackend): +class KubernetesBackend(base.TrainingBackend): 
def __init__(self, - cfg: K8SBackendConfig, + cfg: KubernetesBackendConfig, ): if cfg.namespace is None: cfg.namespace = utils.get_default_target_namespace(cfg.context) diff --git a/kubeflow/trainer/types/backends.py b/kubeflow/trainer/backends/kubernetes/types.py similarity index 83% rename from kubeflow/trainer/types/backends.py rename to kubeflow/trainer/backends/kubernetes/types.py index 463d223be..75ade1f24 100644 --- a/kubeflow/trainer/types/backends.py +++ b/kubeflow/trainer/backends/kubernetes/types.py @@ -13,14 +13,13 @@ # limitations under the License. from typing import Optional - -from pydantic import BaseModel from kubernetes import client -class BackendConfig(BaseModel): - pass +from kubeflow.trainer.types import types + + -class K8SBackendConfig(BackendConfig): +class KubernetesBackendConfig(types.BackendConfig): namespace: Optional[str] = None config_file: Optional[str] = None context: Optional[str] = None @@ -28,7 +27,3 @@ class K8SBackendConfig(BackendConfig): class Config: arbitrary_types_allowed = True - -class LocalProcessBackendConfig(BackendConfig): - pass - diff --git a/kubeflow/trainer/backends/local/__init__.py b/kubeflow/trainer/backends/local/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/kubeflow/trainer/local/job.py b/kubeflow/trainer/backends/local/job.py similarity index 98% rename from kubeflow/trainer/local/job.py rename to kubeflow/trainer/backends/local/job.py index 0f9e1ffb7..4ce92814e 100644 --- a/kubeflow/trainer/local/job.py +++ b/kubeflow/trainer/backends/local/job.py @@ -18,7 +18,7 @@ from datetime import datetime from typing import List, Union -from kubeflow.trainer.local import resource_manager +from kubeflow.trainer.backends.local import resource_manager logger = logging.getLogger(__name__) diff --git a/kubeflow/trainer/backends/local_process.py b/kubeflow/trainer/backends/local/local_process.py similarity index 93% rename from kubeflow/trainer/backends/local_process.py rename to 
kubeflow/trainer/backends/local/local_process.py index eddd50856..fc1e698e5 100644 --- a/kubeflow/trainer/backends/local_process.py +++ b/kubeflow/trainer/backends/local/local_process.py @@ -21,11 +21,11 @@ from kubeflow.trainer.constants import constants from kubeflow.trainer.types import types -from kubeflow.trainer.types.backends import LocalProcessBackendConfig -from kubeflow.trainer.utils import local as local_utils +from kubeflow.trainer.backends.local.types import LocalProcessBackendConfig +from kubeflow.trainer.backends.local import utils from kubeflow.trainer.backends import base -from kubeflow.trainer.local import runtimes as local_runtimes -from kubeflow.trainer.local import job +from kubeflow.trainer.backends.local import runtimes as local_runtimes +from kubeflow.trainer.backends.local import job logger = logging.getLogger(__name__) @@ -73,9 +73,9 @@ def train(self, # create venv if runtime.create_venv: self.__create_venv(env_dir=target_dir) - runtime.python_path = local_utils.get_venv_python_path(target_dir) + runtime.python_path = utils.get_venv_python_path(target_dir) - command, args = local_utils.build_local_training_executable( + command, args = utils.build_local_training_executable( runtime, trainer.func, trainer.func_args, diff --git a/kubeflow/trainer/local/resource_manager.py b/kubeflow/trainer/backends/local/resource_manager.py similarity index 100% rename from kubeflow/trainer/local/resource_manager.py rename to kubeflow/trainer/backends/local/resource_manager.py diff --git a/kubeflow/trainer/local/runtimes.py b/kubeflow/trainer/backends/local/runtimes.py similarity index 100% rename from kubeflow/trainer/local/runtimes.py rename to kubeflow/trainer/backends/local/runtimes.py diff --git a/kubeflow/trainer/backends/local/types.py b/kubeflow/trainer/backends/local/types.py new file mode 100644 index 000000000..8061032e4 --- /dev/null +++ b/kubeflow/trainer/backends/local/types.py @@ -0,0 +1,18 @@ +# Copyright 2025 The Kubeflow Authors. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from kubeflow.trainer.types import types + +class LocalProcessBackendConfig(types.BackendConfig): + pass diff --git a/kubeflow/trainer/utils/local.py b/kubeflow/trainer/backends/local/utils.py similarity index 100% rename from kubeflow/trainer/utils/local.py rename to kubeflow/trainer/backends/local/utils.py diff --git a/kubeflow/trainer/types/types.py b/kubeflow/trainer/types/types.py index 3ed2e512b..16f7b82a7 100644 --- a/kubeflow/trainer/types/types.py +++ b/kubeflow/trainer/types/types.py @@ -20,9 +20,10 @@ from datetime import datetime from enum import Enum from typing import Callable, Dict, Optional, List, Union +from pydantic import BaseModel from kubeflow.trainer.constants import constants -from kubeflow.trainer.local.job import LocalJob +from kubeflow.trainer.backends.local.job import LocalJob # Configuration for the Custom Trainer. 
@@ -280,8 +281,12 @@ class LocalTrainJob(TrainJob): # Training Backends Types +class BackendConfig(BaseModel): + pass + # this can be simplified if we drop python3.9 support as follows RuntimeList = Union[List[Runtime], List[LocalRuntime]] TrainingRuntime = Union[Runtime, LocalRuntime] TrainJobLike = Union[TrainJob, LocalTrainJob] + From 1dbc3e980204a26bfe338ba390c132f04a63668a Mon Sep 17 00:00:00 2001 From: Saad Zaher Date: Fri, 22 Aug 2025 12:39:56 +0100 Subject: [PATCH 16/33] Update kubeflow/trainer/api/trainer_client.py Co-authored-by: Andrey Velichkevich Signed-off-by: Saad Zaher --- kubeflow/trainer/api/trainer_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kubeflow/trainer/api/trainer_client.py b/kubeflow/trainer/api/trainer_client.py index cd8f21cc0..9293f1910 100644 --- a/kubeflow/trainer/api/trainer_client.py +++ b/kubeflow/trainer/api/trainer_client.py @@ -1,4 +1,4 @@ -# Copyright 2024-2025 The Kubeflow Authors. +# Copyright 2024 The Kubeflow Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
From 46a5fd7a67c104f61bb34196ecb3ac7f0cba5410 Mon Sep 17 00:00:00 2001 From: Saad Zaher Date: Sun, 7 Sep 2025 02:48:18 +0100 Subject: [PATCH 17/33] add get_runtime_packages as not supported by local-exec Signed-off-by: Saad Zaher --- kubeflow/trainer/api/trainer_client.py | 8 +- .../trainer/backends/localprocess/backend.py | 113 ++++++------ kubeflow/trainer/backends/localprocess/job.py | 21 ++- .../backends/localprocess/resource_manager.py | 164 ------------------ .../trainer/backends/localprocess/runtimes.py | 10 +- .../trainer/backends/localprocess/types.py | 2 + .../trainer/backends/localprocess/utils.py | 31 ++-- kubeflow/trainer/constants/constants.py | 3 - 8 files changed, 99 insertions(+), 253 deletions(-) delete mode 100644 kubeflow/trainer/backends/localprocess/resource_manager.py diff --git a/kubeflow/trainer/api/trainer_client.py b/kubeflow/trainer/api/trainer_client.py index c57ff7117..a61c8084b 100644 --- a/kubeflow/trainer/api/trainer_client.py +++ b/kubeflow/trainer/api/trainer_client.py @@ -19,6 +19,8 @@ from kubeflow.trainer.types import types from kubeflow.trainer.backends.kubernetes.backend import KubernetesBackend from kubeflow.trainer.backends.kubernetes.types import KubernetesBackendConfig +from kubeflow.trainer.backends.localprocess.backend import LocalProcessBackend +from kubeflow.trainer.backends.localprocess.backend import LocalProcessBackendConfig logger = logging.getLogger(__name__) @@ -27,7 +29,9 @@ class TrainerClient: def __init__( self, - backend_config: KubernetesBackendConfig = KubernetesBackendConfig(), + backend_config: Union[ + KubernetesBackendConfig, LocalProcessBackendConfig + ] = KubernetesBackendConfig(), ): """Initialize a Kubeflow Trainer client. 
@@ -43,6 +47,8 @@ def __init__( # initialize training backend if isinstance(backend_config, KubernetesBackendConfig): self.backend = KubernetesBackend(backend_config) + elif isinstance(backend_config, LocalProcessBackendConfig): + self.backend = LocalProcessBackend(backend_config) else: raise ValueError("Invalid backend config '{}'".format(backend_config)) diff --git a/kubeflow/trainer/backends/localprocess/backend.py b/kubeflow/trainer/backends/localprocess/backend.py index 77697e599..961183bec 100644 --- a/kubeflow/trainer/backends/localprocess/backend.py +++ b/kubeflow/trainer/backends/localprocess/backend.py @@ -20,13 +20,15 @@ import random from datetime import datetime from pathlib import Path -from typing import List, Optional, Set, Dict, Union +from typing import List, Optional, Set, Union, Iterator from kubeflow.trainer.constants import constants from kubeflow.trainer.types import types from kubeflow.trainer.backends.base import ExecutionBackend from kubeflow.trainer.backends.localprocess.types import ( - LocalProcessBackendConfig, LocalBackendJobs, LocalBackendStep + LocalProcessBackendConfig, + LocalBackendJobs, + LocalBackendStep, ) from kubeflow.trainer.backends.localprocess.runtimes import local_runtimes from kubeflow.trainer.backends.localprocess.job import LocalJob @@ -36,34 +38,33 @@ class LocalProcessBackend(ExecutionBackend): - - def __init__(self, - cfg: LocalProcessBackendConfig, - ): + def __init__( + self, + cfg: LocalProcessBackendConfig, + ): # list of running subprocesses self.__local_jobs: List[LocalBackendJobs] = [] self.cfg = cfg - def list_runtimes(self) -> List[types.Runtime]: - return [local_runtime.runtime for local_runtime in local_runtimes] def get_runtime(self, name: str) -> Optional[types.Runtime]: - _runtime = [rt.runtime for rt in local_runtimes if rt.runtime.name == name] if not _runtime: raise ValueError(f"Runtime '{name}' not found.") return _runtime[0] + def get_runtime_packages(self, runtime: types.Runtime): + raise 
NotImplementedError("get_runtime_packages is not supported by LocalProcessBackend") + def train( self, runtime: Optional[types.Runtime] = None, initializer: Optional[types.Initializer] = None, trainer: Optional[Union[types.CustomTrainer, types.BuiltinTrainer]] = None, ) -> str: - train_job_name = "kft-{}".format( random.choice(string.ascii_lowercase) + uuid.uuid4().hex[:11], ) @@ -81,11 +82,11 @@ def train( local_runtime = self.__get_full_runtime(runtime) - runtime.trainer = local_utils.get_runtime_trainer( + runtime.trainer = local_utils.get_runtime_trainer( venv_dir=target_dir, python_bin=str(python_bin), framework=runtime.trainer.framework, - ml_policy=local_runtime.ml_policy + ml_policy=local_runtime.ml_policy, ) training_command = [] @@ -98,8 +99,9 @@ def train( deps_command = local_utils.get_dependencies_command( python_bin=python_bin, pip_bin=str(pip_bin), - pip_index_urls=trainer.pip_index_urls if trainer.pip_index_urls else - constants.DEFAULT_PIP_INDEX_URLS, + pip_index_urls=trainer.pip_index_urls + if trainer.pip_index_urls + else constants.DEFAULT_PIP_INDEX_URLS, packages=trainer.packages_to_install, ) training_command = local_utils.get_command_using_train_func( @@ -120,7 +122,7 @@ def train( command=deps_command, debug=self.cfg.debug, execution_dir=target_dir, - env=trainer.env + env=trainer.env, ) deps_job.start() # make sure training doesn't start before dependencies installation finish @@ -141,7 +143,7 @@ def train( cleanup_dependencies.append(train_job) self.__register_job(train_job_name, "train", train_job) - + # if cleanup is requested. The virtualenv dir will be deleted. 
if self.cfg.cleanup: cleanup_command = local_utils.get_cleanup_command(venv_dir=target_dir) cleanup_job = LocalJob( @@ -157,13 +159,13 @@ def train( return train_job_name - def list_jobs(self, runtime: Optional[types.Runtime] = None) -> List[types.TrainJob]: - result = [ types.TrainJob( - name=j.name, creation_timestamp=j.created, - runtime=runtime, num_nodes=1, + name=j.name, + creation_timestamp=j.created, + runtime=runtime, + num_nodes=1, steps=[ types.Step(name=s.step_name, pod_name=s.step_name, status=s.job.status) for s in j.steps @@ -174,55 +176,56 @@ def list_jobs(self, runtime: Optional[types.Runtime] = None) -> List[types.Train return result - def get_job(self, name: str) -> Optional[types.TrainJob]: - j = [j for j in self.__local_jobs if j.name == name] - if not j: + _job = next((j for j in self.__local_jobs if j.name == name), None) + if _job is None: raise ValueError("No TrainJob with name '%s'" % name) - # get job status - status = self.__get_job_status(j[0]) + # check and set the correct job status to match `TrainerClient` supported statuses + status = self.__get_job_status(_job[0]) return types.TrainJob( - name=j[0].name, - creation_timestamp=j[0].created, + name=_job[0].name, + creation_timestamp=_job[0].created, steps=[ - types.Step(name=s.step_name, pod_name=s.step_name, status=s.job.status) - for s in j[0].steps + types.Step(name=_step.step_name, pod_name=_step.step_name, status=_step.job.status) + for _step in _job[0].steps ], - runtime=None, num_nodes=1, status=status, + runtime=None, + num_nodes=1, + status=status, ) - def get_job_logs(self, - name: str, - follow: Optional[bool] = False, - step: str = constants.NODE, - node_rank: int = 0) -> Dict[str, str]: + def get_job_logs( + self, + name: str, + follow: Optional[bool] = False, + step: str = constants.NODE + "-0", + node_rank: int = 0, + ) -> Iterator[str]: _job = [j for j in self.__local_jobs if j.name == name] if not _job: raise ValueError("No TrainJob with name '%s'" % name) - _job = 
_job[0] - - logs_dict = {} - for _step in _job.steps: - if step != constants.NODE and _step.step_name == step: - logs_dict[_step.step_name] = "".join(_step.job.logs(follow=follow)) - else: - logs_dict[_step.step_name] = _step.job.logs(follow=follow) + want_all_steps = step == constants.NODE + "-0" - return logs_dict + for _step in _job[0].steps: + if not want_all_steps and _step.step_name != step: + continue + # Flatten the generator and pass through flags so it behaves as expected + # (adjust args if stream_logs has different signature) + yield from _step.job.logs(follow=follow) def delete_job(self, name: str): - # delete job from registry or job list - target = [j for j in self.__local_jobs if j.name == name] - if not target: - + # find job first. + _job = next((j for j in self.__local_jobs if j.name == name), None) + if _job is None: raise ValueError("No TrainJob with name '%s'" % name) + # cancel all nested step jobs in target job - _ = [step.job.cancel() for step in target[0].steps] + _ = [step.job.cancel() for step in _job[0].steps] # remove the job from the list of jobs - self.__local_jobs.remove(target[0]) + self.__local_jobs.remove(_job[0]) def wait_for_job_status( self, @@ -231,18 +234,18 @@ def wait_for_job_status( timeout: int = 600, polling_interval: int = 2, ) -> types.TrainJob: + # find first match or fallback + _job = next((_job for _job in self.__local_jobs if _job.name == name), None) - local_job = [_job for _job in self.__local_jobs if _job.name == name] - if not local_job: + if _job is None: raise ValueError("No TrainJob with name '%s'" % name) - local_job = local_job[0] - for _step in local_job.steps: + # find a better implementation for this + for _step in _job.steps: if _step.status in [constants.TRAINJOB_RUNNING, constants.TRAINJOB_CREATED]: _step.job.join(timeout=timeout) return self.get_job(name) def __setup_runtime(self, train_job_name): - target_dir = tempfile.mkdtemp(prefix=f"{train_job_name}-") venv.create(env_dir=target_dir, 
with_pip=False) diff --git a/kubeflow/trainer/backends/localprocess/job.py b/kubeflow/trainer/backends/localprocess/job.py index 288b289d8..4c593fbec 100644 --- a/kubeflow/trainer/backends/localprocess/job.py +++ b/kubeflow/trainer/backends/localprocess/job.py @@ -25,8 +25,13 @@ class LocalJob(threading.Thread): def __init__( - self, name, command: Union[List, Tuple[str], str], execution_dir:str = None, - debug: bool=False, env: Dict[str, str] = None, dependencies: List = None, + self, + name, + command: Union[List, Tuple[str], str], + execution_dir: str = None, + debug: bool = False, + env: Dict[str, str] = None, + dependencies: List = None, ): """Create a LocalJob. Create a local subprocess with threading to allow users to create background jobs. @@ -112,12 +117,14 @@ def run(self): self._process.stdout.close() self._returncode = self._process.wait() self._end_time = datetime.now() - self._success = (self._process.returncode == 0) - msg = (f"[{self.name}] Completed with code {self._returncode}" - f" in {self._end_time - self._start_time} seconds.") + self._success = self._process.returncode == 0 + msg = ( + f"[{self.name}] Completed with code {self._returncode}" + f" in {self._end_time - self._start_time} seconds." + ) # set status based on success or failure - self._status = constants.TRAINJOB_COMPLETE if self._success else ( - constants.TRAINJOB_FAILED + self._status = ( + constants.TRAINJOB_COMPLETE if self._success else (constants.TRAINJOB_FAILED) ) self._stdout += msg if self.debug: diff --git a/kubeflow/trainer/backends/localprocess/resource_manager.py b/kubeflow/trainer/backends/localprocess/resource_manager.py deleted file mode 100644 index f2271994f..000000000 --- a/kubeflow/trainer/backends/localprocess/resource_manager.py +++ /dev/null @@ -1,164 +0,0 @@ -# Copyright 2025 The Kubeflow Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -""" -spawn_limited.py - -Cross-platform subprocess launcher that can: - • cap memory (bytes) - • cap total CPU-seconds - • pin to specific CPU cores - • lower priority (nice value) -""" -# POSIX‐only imports; will be skipped on Windows -import re -try: - import resource - import os -except ImportError: - resource = None - os = None - - -def _parse_mem_limit(limit): - """ - Parse a k8s-style memory string into bytes. - Accepts: - - int or float (interpreted as bytes) - - str: "", where suffix is one of: - K, KB, Ki, KiB, M, MB, Mi, MiB, G, GB, Gi, GiB, ... - Allows decimal or binary prefixes and optional 'B'. - """ - if isinstance(limit, (int, float)): - return int(limit) - s = limit.upper().strip() - m = re.fullmatch(r"(\d+(\.\d+)?)([KMGTPEmgtpe][iI]?[bB]?)?", s) - if not m: - raise ValueError(f"Invalid memory limit format: {limit!r}") - number = float(m.group(1)) - unit = (m.group(3) or "").upper() - - # mapping suffix → multiplier - mult = { - "K": 10**3, "KB": 10**3, - "KI": 2**10, "KIB": 2**10, - "M": 10**6, "MB": 10**6, - "MI": 2**20, "MIB": 2**20, - "G": 10**9, "GB": 10**9, - "GI": 2**30, "GIB": 2**30, - "T": 10**12, "TB": 10**12, - "TI": 2**40, "TIB": 2**40, - "P": 10**15, "PB": 10**15, - "PI": 2**50, "PIB": 2**50, - "E": 10**18, "EB": 10**18, - "EI": 2**60, "EIB": 2**60, - "": 1, - } - if unit not in mult: - raise ValueError(f"Unknown memory unit: {unit!r}") - return int(number * mult[unit]) - - -def _parse_cpu_limit(limit): - """ - Parse a k8s-style CPU string into a float number of CPUs. 
- - "250m" ⇒ 0.25 - - "1" ⇒ 1.0 - - 2 ⇒ 2.0 - """ - if isinstance(limit, (int, float)): - return float(limit) - s = limit.strip() - m = re.fullmatch(r"(\d+(\.\d+)?)(m?)", s, re.IGNORECASE) - if not m: - raise ValueError(f"Invalid CPU limit: {limit!r}") - number = float(m.group(1)) - if m.group(3).lower() == "m": - return number / 1000.0 - return number - - -def _limit_resources_posix(mem_limit=None, cpu_time=None, cpu_cores=None, nice=0): - """ - Called in the child (via preexec_fn) on Linux/macOS. - - mem_limit: int bytes or k8s-style str (e.g. "1Gi", "512M", "8G") - - cpu_time: int seconds; maximum CPU-seconds before SIGXCPU - - cpu_cores: iterable of core indices (Linux only) - - nice: added niceness - """ - # 1) Lower priority - if nice and os: - os.nice(nice) - - # 2) Memory limit - if mem_limit is not None and resource: - try: - bytes_limit = _parse_mem_limit(mem_limit) - resource.setrlimit(resource.RLIMIT_AS, (bytes_limit, bytes_limit)) - except Exception: - # Some platforms (e.g. macOS) may ignore or reject RLIMIT_AS - pass - - # 3) CPU‐seconds limit - if cpu_time and resource: - resource.setrlimit(resource.RLIMIT_CPU, (cpu_time, cpu_time)) - - # 4) CPU‐core affinity (Linux only) - if cpu_cores and hasattr(os, "sched_setaffinity"): - os.sched_setaffinity(0, set(cpu_cores)) - - -def setup_local_process(mem_limit=None, cpu_limit=None, cpu_time=None, nice=0): - """ - Set up the subporcess pre-exec function to set resource limits, ...etc. - - Arguments: - mem_limit – int/float bytes or k8s-style string ("8Gi", "512Mi", etc.) - cpu – k8s-style CPU string or number ("750m", "2", etc.) 
- *fractional CPUs are parsed but ignored; only whole cores used* - cpu_time – int seconds; maximum CPU-seconds before SIGXCPU - nice – int; niceness increment - - Returns: - Callable preexec_fn function that gets executed after the fork() and before exec() - """ - preexec = None - # --- Parse memory --- - mem_bytes = None - if mem_limit is not None: - mem_bytes = _parse_mem_limit(mem_limit) - - # --- Parse CPU and select cores --- - cpu_cores = None - if cpu_limit is not None: - cpu_units = _parse_cpu_limit(cpu_limit) - total_cores = os.cpu_count() or 1 - # Only the integer part is used for affinity - n_cores = min(int(cpu_units), total_cores) - cpu_cores = list(range(n_cores)) if n_cores > 0 else [] - - # --- Build preexec_fn for POSIX --- - preexec = None - if mem_bytes is not None or cpu_cores is not None or nice: - def preexec_fn(): - return _limit_resources_posix( - mem_limit=mem_bytes, - cpu_time=cpu_time, - cpu_cores=cpu_cores, - nice=nice, - ) - preexec = preexec_fn - - return preexec diff --git a/kubeflow/trainer/backends/localprocess/runtimes.py b/kubeflow/trainer/backends/localprocess/runtimes.py index 52badbfd3..d2b30fd44 100644 --- a/kubeflow/trainer/backends/localprocess/runtimes.py +++ b/kubeflow/trainer/backends/localprocess/runtimes.py @@ -14,13 +14,13 @@ from kubeflow_trainer_api.models.trainer_v1alpha1_ml_policy import TrainerV1alpha1MLPolicy from kubeflow_trainer_api.models.trainer_v1alpha1_torch_ml_policy_source import ( - TrainerV1alpha1TorchMLPolicySource + TrainerV1alpha1TorchMLPolicySource, ) from kubeflow_trainer_api.models.trainer_v1alpha1_torch_elastic_policy import ( - TrainerV1alpha1TorchElasticPolicy + TrainerV1alpha1TorchElasticPolicy, ) from kubeflow_trainer_api.models.io_k8s_apimachinery_pkg_util_intstr_int_or_string import ( - IoK8sApimachineryPkgUtilIntstrIntOrString + IoK8sApimachineryPkgUtilIntstrIntOrString, ) from kubeflow.trainer.types import types as base_types from kubeflow.trainer.constants import constants @@ -36,7 
+36,7 @@ num_nodes=1, device=constants.UNKNOWN, device_count=constants.UNKNOWN, - ) + ), ), ml_policy=TrainerV1alpha1MLPolicy( torch=TrainerV1alpha1TorchMLPolicySource( @@ -45,6 +45,6 @@ ), numProcPerNode=IoK8sApimachineryPkgUtilIntstrIntOrString(1), ) - ) + ), ) ] diff --git a/kubeflow/trainer/backends/localprocess/types.py b/kubeflow/trainer/backends/localprocess/types.py index 2fae21e7b..12697b398 100644 --- a/kubeflow/trainer/backends/localprocess/types.py +++ b/kubeflow/trainer/backends/localprocess/types.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + import typing from datetime import datetime from typing import List, Optional @@ -44,6 +45,7 @@ class LocalBackendStep(BaseModel): class Config: arbitrary_types_allowed = True + class LocalBackendJobs(BaseModel): steps: Optional[List[LocalBackendStep]] = [] name: str diff --git a/kubeflow/trainer/backends/localprocess/utils.py b/kubeflow/trainer/backends/localprocess/utils.py index 7b5b395dc..1859c4527 100644 --- a/kubeflow/trainer/backends/localprocess/utils.py +++ b/kubeflow/trainer/backends/localprocess/utils.py @@ -13,10 +13,10 @@ def get_runtime_trainer( - venv_dir: str, - python_bin: str, - framework: str, - ml_policy: models.TrainerV1alpha1MLPolicy, + venv_dir: str, + python_bin: str, + framework: str, + ml_policy: models.TrainerV1alpha1MLPolicy, ) -> types.RuntimeTrainer: """ Get the RuntimeTrainer object. 
@@ -59,11 +59,7 @@ def get_dependencies_command(python_bin, pip_bin: str, pip_index_urls: str, pack } t = Template(local_exec_constants.DEPENDENCIES_SCRIPT) result = t.substitute(**mapping) - return ( - 'bash', - '-c', - result - ) + return ("bash", "-c", result) def get_local_devices(resources: dict[str, str]) -> (str, int): @@ -82,11 +78,11 @@ def get_local_devices(resources: dict[str, str]) -> (str, int): def get_command_using_train_func( - runtime: types.Runtime, - train_func: Callable, - train_func_parameters: Optional[Dict[str, Any]], - venv_dir: str, - train_job_name: str, + runtime: types.Runtime, + train_func: Callable, + train_func_parameters: Optional[Dict[str, Any]], + venv_dir: str, + train_job_name: str, ) -> tuple: """ Get the Trainer container command from the given training function and parameters. @@ -106,8 +102,7 @@ def get_command_using_train_func( # Extract the file name where the function is defined and move it the venv directory. func_file = Path(venv_dir) / "{}-{}".format( - train_job_name, - os.path.basename(inspect.getfile(train_func)) + train_job_name, os.path.basename(inspect.getfile(train_func)) ) # Function might be defined in some indented scope (e.g. in another function). @@ -137,7 +132,7 @@ def get_command_using_train_func( } command = t.safe_substitute(**mapping) - return 'bash', '-c', command + return "bash", "-c", command def get_cleanup_command(venv_dir: str) -> tuple: @@ -145,4 +140,4 @@ def get_cleanup_command(venv_dir: str) -> tuple: t = Template(local_exec_constants.LOCAL_EXEC_JOB_CLEANUP_SCRIPT) cleanup_command = t.substitute(**mapping) - return 'bash', '-c', cleanup_command + return "bash", "-c", cleanup_command diff --git a/kubeflow/trainer/constants/constants.py b/kubeflow/trainer/constants/constants.py index c219650e5..7e29f5920 100644 --- a/kubeflow/trainer/constants/constants.py +++ b/kubeflow/trainer/constants/constants.py @@ -101,9 +101,6 @@ # The label for TPU in the container resources. 
TPU_LABEL = "google.com/tpu" -# The label for MPS in the local and container resources. -MPS_LABEL = "apple.com/mps" - # The label key to identify the JobSet name of the Pod. JOBSET_NAME_LABEL = "jobset.sigs.k8s.io/jobset-name" From ea3e9cfc8859cbf7530aa6c37db8b1e79f8cf89f Mon Sep 17 00:00:00 2001 From: Saad Zaher Date: Sun, 7 Sep 2025 02:56:34 +0100 Subject: [PATCH 18/33] move backends and its configs to kubeflow.trainer Signed-off-by: Saad Zaher --- kubeflow/trainer/__init__.py | 11 +++++++++++ kubeflow/trainer/backends/__init__.py | 25 +++++++++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/kubeflow/trainer/__init__.py b/kubeflow/trainer/__init__.py index dce03bbb7..582eb71c6 100644 --- a/kubeflow/trainer/__init__.py +++ b/kubeflow/trainer/__init__.py @@ -38,6 +38,13 @@ TrainerType, ) +# import backends and its associated configs +from kubeflow.trainer.backends.kubernetes.backend import KubernetesBackend +from kubeflow.trainer.backends.kubernetes.types import KubernetesBackendConfig +from kubeflow.trainer.backends.localprocess.backend import LocalProcessBackend +from kubeflow.trainer.backends.localprocess.types import LocalProcessBackendConfig + + __all__ = [ "BuiltinTrainer", "CustomTrainer", @@ -55,4 +62,8 @@ "RuntimeTrainer", "TrainerClient", "TrainerType", + "KubernetesBackend", + "LocalProcessBackend", + "LocalProcessBackendConfig", + "KubernetesBackendConfig", ] diff --git a/kubeflow/trainer/backends/__init__.py b/kubeflow/trainer/backends/__init__.py index e69de29bb..3ccba2d53 100644 --- a/kubeflow/trainer/backends/__init__.py +++ b/kubeflow/trainer/backends/__init__.py @@ -0,0 +1,25 @@ +# Copyright 2025 The Kubeflow Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from kubeflow.trainer.backends.kubernetes.backend import KubernetesBackend +from kubeflow.trainer.backends.kubernetes.types import KubernetesBackendConfig +from kubeflow.trainer.backends.localprocess.backend import LocalProcessBackend +from kubeflow.trainer.backends.localprocess.types import LocalProcessBackendConfig + +__all__ = [ + "KubernetesBackend", + "LocalProcessBackend", + "LocalProcessBackendConfig", + "KubernetesBackendConfig", +] From 2976f8a228426459159e75718aa21e34560afcd0 Mon Sep 17 00:00:00 2001 From: Saad Zaher Date: Mon, 8 Sep 2025 11:05:40 +0100 Subject: [PATCH 19/33] fix typo in delete_job Signed-off-by: Saad Zaher --- kubeflow/trainer/backends/localprocess/backend.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kubeflow/trainer/backends/localprocess/backend.py b/kubeflow/trainer/backends/localprocess/backend.py index 961183bec..ffd31149a 100644 --- a/kubeflow/trainer/backends/localprocess/backend.py +++ b/kubeflow/trainer/backends/localprocess/backend.py @@ -223,9 +223,9 @@ def delete_job(self, name: str): raise ValueError("No TrainJob with name '%s'" % name) # cancel all nested step jobs in target job - _ = [step.job.cancel() for step in _job[0].steps] + _ = [step.job.cancel() for step in _job.steps] # remove the job from the list of jobs - self.__local_jobs.remove(_job[0]) + self.__local_jobs.remove(_job) def wait_for_job_status( self, From e4a57b38bf6ba59e520703de94b3c1e022dda3ee Mon Sep 17 00:00:00 2001 From: Saad Zaher Date: Mon, 8 Sep 2025 12:46:32 +0100 Subject: [PATCH 20/33] Move local_runtimes to 
constants * Move local_runtimes to constants * allow list_jobs to filter by runtime * keep runtime ref in __local_jobs Signed-off-by: Saad Zaher --- .../trainer/backends/localprocess/backend.py | 95 ++++++++++++------- .../{runtimes.py => constants.py} | 31 ++++++ .../trainer/backends/localprocess/types.py | 1 + .../trainer/backends/localprocess/utils.py | 21 +--- .../trainer/constants/local_exec_constants.py | 45 --------- 5 files changed, 93 insertions(+), 100 deletions(-) rename kubeflow/trainer/backends/localprocess/{runtimes.py => constants.py} (71%) delete mode 100644 kubeflow/trainer/constants/local_exec_constants.py diff --git a/kubeflow/trainer/backends/localprocess/backend.py b/kubeflow/trainer/backends/localprocess/backend.py index ffd31149a..f94c646cf 100644 --- a/kubeflow/trainer/backends/localprocess/backend.py +++ b/kubeflow/trainer/backends/localprocess/backend.py @@ -29,8 +29,9 @@ LocalProcessBackendConfig, LocalBackendJobs, LocalBackendStep, + LocalRuntime, ) -from kubeflow.trainer.backends.localprocess.runtimes import local_runtimes +from kubeflow.trainer.backends.localprocess.constants import local_runtimes from kubeflow.trainer.backends.localprocess.job import LocalJob from kubeflow.trainer.backends.localprocess import utils as local_utils @@ -127,7 +128,12 @@ def train( deps_job.start() # make sure training doesn't start before dependencies installation finish training_dependencies.append(deps_job) - self.__register_job(train_job_name, "deps", deps_job) + self.__register_job( + train_job_name=train_job_name, + step_name="deps", + job=deps_job, + runtime=local_runtime, + ) if training_command: train_job = LocalJob( @@ -141,7 +147,12 @@ def train( train_job.start() # ask cleanup job to wait for training to be completed. 
cleanup_dependencies.append(train_job) - self.__register_job(train_job_name, "train", train_job) + self.__register_job( + train_job_name=train_job_name, + step_name="train", + job=train_job, + runtime=local_runtime, + ) # if cleanup is requested. The virtualenv dir will be deleted. if self.cfg.cleanup: @@ -155,25 +166,33 @@ def train( dependencies=cleanup_dependencies, ) cleanup_job.start() - self.__register_job(train_job_name, "cleanup", cleanup_job) + self.__register_job( + train_job_name=train_job_name, + step_name="cleanup", + job=cleanup_job, + runtime=local_runtime, + ) return train_job_name def list_jobs(self, runtime: Optional[types.Runtime] = None) -> List[types.TrainJob]: - result = [ - types.TrainJob( - name=j.name, - creation_timestamp=j.created, - runtime=runtime, - num_nodes=1, - steps=[ - types.Step(name=s.step_name, pod_name=s.step_name, status=s.job.status) - for s in j.steps - ], - ) - for j in self.__local_jobs - ] + result = [] + for _job in self.__local_jobs: + if runtime and _job.runtime.runtime.name != runtime.name: + continue + result.append( + types.TrainJob( + name=_job.name, + creation_timestamp=_job.created, + runtime=runtime, + num_nodes=1, + steps=[ + types.Step(name=s.step_name, pod_name=s.step_name, status=s.job.status) + for s in _job.steps + ], + ) + ) return result def get_job(self, name: str) -> Optional[types.TrainJob]: @@ -182,16 +201,16 @@ def get_job(self, name: str) -> Optional[types.TrainJob]: raise ValueError("No TrainJob with name '%s'" % name) # check and set the correct job status to match `TrainerClient` supported statuses - status = self.__get_job_status(_job[0]) + status = self.__get_job_status(_job) return types.TrainJob( - name=_job[0].name, - creation_timestamp=_job[0].created, + name=_job.name, + creation_timestamp=_job.created, steps=[ types.Step(name=_step.step_name, pod_name=_step.step_name, status=_step.job.status) - for _step in _job[0].steps + for _step in _job.steps ], - runtime=None, + 
runtime=_job.runtime.runtime, num_nodes=1, status=status, ) @@ -216,17 +235,6 @@ def get_job_logs( # (adjust args if stream_logs has different signature) yield from _step.job.logs(follow=follow) - def delete_job(self, name: str): - # find job first. - _job = next((j for j in self.__local_jobs if j.name == name), None) - if _job is None: - raise ValueError("No TrainJob with name '%s'" % name) - - # cancel all nested step jobs in target job - _ = [step.job.cancel() for step in _job.steps] - # remove the job from the list of jobs - self.__local_jobs.remove(_job) - def wait_for_job_status( self, name: str, @@ -245,6 +253,17 @@ def wait_for_job_status( _step.job.join(timeout=timeout) return self.get_job(name) + def delete_job(self, name: str): + # find job first. + _job = next((j for j in self.__local_jobs if j.name == name), None) + if _job is None: + raise ValueError("No TrainJob with name '%s'" % name) + + # cancel all nested step jobs in target job + _ = [step.job.cancel() for step in _job.steps] + # remove the job from the list of jobs + self.__local_jobs.remove(_job) + def __setup_runtime(self, train_job_name): target_dir = tempfile.mkdtemp(prefix=f"{train_job_name}-") venv.create(env_dir=target_dir, with_pip=False) @@ -276,10 +295,16 @@ def __get_job_status(self, job: LocalBackendJobs) -> str: return status - def __register_job(self, train_job_name, step_name, job): + def __register_job( + self, + train_job_name: str, + step_name: str, + job: LocalJob, + runtime: LocalRuntime = None, + ): _job = [j for j in self.__local_jobs if j.name == train_job_name] if not _job: - _job = LocalBackendJobs(name=train_job_name, created=datetime.now()) + _job = LocalBackendJobs(name=train_job_name, runtime=runtime, created=datetime.now()) self.__local_jobs.append(_job) else: _job = _job[0] diff --git a/kubeflow/trainer/backends/localprocess/runtimes.py b/kubeflow/trainer/backends/localprocess/constants.py similarity index 71% rename from 
kubeflow/trainer/backends/localprocess/runtimes.py rename to kubeflow/trainer/backends/localprocess/constants.py index d2b30fd44..edb75d155 100644 --- a/kubeflow/trainer/backends/localprocess/runtimes.py +++ b/kubeflow/trainer/backends/localprocess/constants.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import textwrap from kubeflow_trainer_api.models.trainer_v1alpha1_ml_policy import TrainerV1alpha1MLPolicy from kubeflow_trainer_api.models.trainer_v1alpha1_torch_ml_policy_source import ( TrainerV1alpha1TorchMLPolicySource, @@ -26,6 +27,7 @@ from kubeflow.trainer.constants import constants from kubeflow.trainer.backends.localprocess import types + local_runtimes = [ types.LocalRuntime( runtime=base_types.Runtime( @@ -48,3 +50,32 @@ ), ) ] + +# The exec script to embed training function into container command. +DEPENDENCIES_SCRIPT = textwrap.dedent( + """ + $PYTHON_BIN -m ensurepip --upgrade --default-pip + PIP_DISABLE_PIP_VERSION_CHECK=1 $PIP_BIN install --quiet \ + --no-warn-script-location $PIP_INDEX $PACKAGE_STR + """ +) + +# activate virtualenv, then run the entrypoint from the virtualenv bin +LOCAL_EXEC_JOB_SCRIPT = textwrap.dedent( + """ + source $PYENV_LOCATION/bin/activate + $ENTRYPOINT "$FUNC_FILE" "$PARAMETERS" + """ +) + +TORCH_COMMAND = "torchrun" + +# default command, will run from within the virtualenv +DEFAULT_COMMAND = "python" + +# remove virtualenv after training is completed. 
+LOCAL_EXEC_JOB_CLEANUP_SCRIPT = textwrap.dedent( + """ + rm -rf $PYENV_LOCATION + """ +) diff --git a/kubeflow/trainer/backends/localprocess/types.py b/kubeflow/trainer/backends/localprocess/types.py index 12697b398..b9631694d 100644 --- a/kubeflow/trainer/backends/localprocess/types.py +++ b/kubeflow/trainer/backends/localprocess/types.py @@ -48,6 +48,7 @@ class Config: class LocalBackendJobs(BaseModel): steps: Optional[List[LocalBackendStep]] = [] + runtime: Optional[LocalRuntime] = None name: str created: typing.Optional[datetime] = None completed: typing.Optional[datetime] = None diff --git a/kubeflow/trainer/backends/localprocess/utils.py b/kubeflow/trainer/backends/localprocess/utils.py index 1859c4527..fa5922576 100644 --- a/kubeflow/trainer/backends/localprocess/utils.py +++ b/kubeflow/trainer/backends/localprocess/utils.py @@ -7,8 +7,7 @@ from kubeflow_trainer_api import models -from kubeflow.trainer.constants import constants -from kubeflow.trainer.constants import local_exec_constants +from kubeflow.trainer.backends.localprocess import constants as local_exec_constants from kubeflow.trainer.types import types @@ -38,9 +37,6 @@ def get_runtime_trainer( if ml_policy.torch: _c = [os.path.join(venv_bin_dir, local_exec_constants.TORCH_COMMAND)] trainer.set_command(tuple(_c)) - elif ml_policy.mpi: - # mpi isn't supported yet - trainer.set_command(tuple(default_cmd)) else: trainer.set_command(tuple(default_cmd)) @@ -62,21 +58,6 @@ def get_dependencies_command(python_bin, pip_bin: str, pip_index_urls: str, pack return ("bash", "-c", result) -def get_local_devices(resources: dict[str, str]) -> (str, int): - device, device_count = constants.UNKNOWN, 0 - - if constants.GPU_LABEL in resources.items(): - device = constants.GPU_LABEL.split("/")[1] - device_count = resources[constants.GPU_LABEL] - elif constants.TPU_LABEL in resources.items(): - device = constants.TPU_LABEL.split("/")[1] - device_count = resources[constants.TPU_LABEL] - elif constants.MPS_LABEL in 
resources.items(): - device = constants.MPS_LABEL.split("/")[1] - device_count = resources[constants.MPS_LABEL] - return device, device_count - - def get_command_using_train_func( runtime: types.Runtime, train_func: Callable, diff --git a/kubeflow/trainer/constants/local_exec_constants.py b/kubeflow/trainer/constants/local_exec_constants.py deleted file mode 100644 index 9df6cbaac..000000000 --- a/kubeflow/trainer/constants/local_exec_constants.py +++ /dev/null @@ -1,45 +0,0 @@ -# Copyright 2025 The Kubeflow Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import textwrap - - -# The exec script to embed training function into container command. -DEPENDENCIES_SCRIPT = textwrap.dedent( - """ - $PYTHON_BIN -m ensurepip --upgrade --default-pip - PIP_DISABLE_PIP_VERSION_CHECK=1 $PIP_BIN install --quiet \ - --no-warn-script-location $PIP_INDEX $PACKAGE_STR - """ -) - -# activate virtualenv, then run the entrypoint from the virtualenv bin -LOCAL_EXEC_JOB_SCRIPT = textwrap.dedent( - """ - source $PYENV_LOCATION/bin/activate - $ENTRYPOINT "$FUNC_FILE" "$PARAMETERS" - """ -) - -TORCH_COMMAND = "torchrun" - -# default command, will run from within the virtualenv -DEFAULT_COMMAND = "python" - -# remove virtualenv after training is completed. 
-LOCAL_EXEC_JOB_CLEANUP_SCRIPT = textwrap.dedent( - """ - rm -rf $PYENV_LOCATION - """ -) From 8d6b1e7909f382cced3465d6ac0094d02c23aeae Mon Sep 17 00:00:00 2001 From: Saad Zaher Date: Mon, 8 Sep 2025 23:25:14 +0100 Subject: [PATCH 21/33] use google style docstring for LocalJob Signed-off-by: Saad Zaher --- kubeflow/trainer/backends/localprocess/job.py | 25 ++++++++----------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/kubeflow/trainer/backends/localprocess/job.py b/kubeflow/trainer/backends/localprocess/job.py index 4c593fbec..0b3c9bd89 100644 --- a/kubeflow/trainer/backends/localprocess/job.py +++ b/kubeflow/trainer/backends/localprocess/job.py @@ -33,20 +33,17 @@ def __init__( env: Dict[str, str] = None, dependencies: List = None, ): - """Create a LocalJob. Create a local subprocess with threading to allow users - to create background jobs. - :param name: The name of the job. - :type name: str - :param command: The command to run. - :type command: str - :param execution_dir: The execution directory. - :type execution_dir: str - :param debug: If true, run in debug mode. - :type debug: bool - :param env: Environment variables. - :type env: Dict[str, str] - :param dependencies: List of dependencies. - :type dependencies: List[str] + """Creates a LocalJob. + + Creates a local subprocess with threading to allow users to create background jobs. + + Args: + name (str): The name of the job. + command (str): The command to run. + execution_dir (str): The execution directory. + debug (bool, optional): If true, run in debug mode. Defaults to False. + env (Dict[str, str], optional): Environment variables. Defaults to None. + dependencies (List[str], optional): List of dependencies. Defaults to None. 
""" super().__init__() self.name = name From c3719b57e3c55e1885772b74c39f6cbbc877d2d0 Mon Sep 17 00:00:00 2001 From: Saad Zaher Date: Tue, 9 Sep 2025 14:41:04 +0100 Subject: [PATCH 22/33] remove debug opt from LocalProcessConfig Signed-off-by: Saad Zaher --- kubeflow/trainer/backends/localprocess/backend.py | 6 +----- kubeflow/trainer/backends/localprocess/job.py | 11 +++-------- kubeflow/trainer/backends/localprocess/types.py | 1 - 3 files changed, 4 insertions(+), 14 deletions(-) diff --git a/kubeflow/trainer/backends/localprocess/backend.py b/kubeflow/trainer/backends/localprocess/backend.py index f94c646cf..57ad8cd3a 100644 --- a/kubeflow/trainer/backends/localprocess/backend.py +++ b/kubeflow/trainer/backends/localprocess/backend.py @@ -78,8 +78,7 @@ def train( # setup runtime target_dir, python_bin, pip_bin = self.__setup_runtime(train_job_name=train_job_name) - if self.cfg.debug: - logger.info("operating in {}".format(target_dir)) + logger.debug("operating in {}".format(target_dir)) local_runtime = self.__get_full_runtime(runtime) @@ -121,7 +120,6 @@ def train( deps_job = LocalJob( name="{}-deps".format(train_job_name), command=deps_command, - debug=self.cfg.debug, execution_dir=target_dir, env=trainer.env, ) @@ -139,7 +137,6 @@ def train( train_job = LocalJob( name="{}-train".format(train_job_name), command=training_command, - debug=self.cfg.debug, execution_dir=target_dir, env=trainer.env, dependencies=training_dependencies, @@ -160,7 +157,6 @@ def train( cleanup_job = LocalJob( name="{}-cleanup".format(train_job_name), command=cleanup_command, - debug=self.cfg.debug, execution_dir=target_dir, env=trainer.env, dependencies=cleanup_dependencies, diff --git a/kubeflow/trainer/backends/localprocess/job.py b/kubeflow/trainer/backends/localprocess/job.py index 0b3c9bd89..3ef8c58e8 100644 --- a/kubeflow/trainer/backends/localprocess/job.py +++ b/kubeflow/trainer/backends/localprocess/job.py @@ -29,7 +29,6 @@ def __init__( name, command: Union[List, Tuple[str], 
str], execution_dir: str = None, - debug: bool = False, env: Dict[str, str] = None, dependencies: List = None, ): @@ -41,7 +40,6 @@ def __init__( name (str): The name of the job. command (str): The command to run. execution_dir (str): The execution directory. - debug (bool, optional): If true, run in debug mode. Defaults to False. env (Dict[str, str], optional): Environment variables. Defaults to None. dependencies (List[str], optional): List of dependencies. Defaults to None. """ @@ -60,7 +58,6 @@ def __init__( self._end_time = None self.env = env or {} self.dependencies = dependencies or [] - self.debug = debug self.execution_dir = execution_dir or os.getcwd() def run(self): @@ -74,9 +71,8 @@ def run(self): current_dir = os.getcwd() try: self._start_time = datetime.now() - if self.debug: - _c = " ".join(self.command) - logger.debug(f"[{self.name}] Started at {self._start_time} with command: \n {_c}") + _c = " ".join(self.command) + logger.debug(f"[{self.name}] Started at {self._start_time} with command: \n {_c}") # change working directory to venv before executing script os.chdir(self.execution_dir) @@ -124,8 +120,7 @@ def run(self): constants.TRAINJOB_COMPLETE if self._success else (constants.TRAINJOB_FAILED) ) self._stdout += msg - if self.debug: - logger.debug(self._stdout) + logger.debug("Job output: ", self._stdout) except Exception as e: with self._lock: diff --git a/kubeflow/trainer/backends/localprocess/types.py b/kubeflow/trainer/backends/localprocess/types.py index b9631694d..40a825f5a 100644 --- a/kubeflow/trainer/backends/localprocess/types.py +++ b/kubeflow/trainer/backends/localprocess/types.py @@ -25,7 +25,6 @@ class LocalProcessBackendConfig(BaseModel): cleanup: bool = True - debug: bool = False # change working directory to venv for execution. 
run_in_venv_dir: bool = True


From 64cdcbaaf418997db9602dbbf8e1af008f00a9f0 Mon Sep 17 00:00:00 2001
From: Saad Zaher
Date: Tue, 9 Sep 2025 14:42:42 +0100
Subject: [PATCH 23/33] only use imports from kubeflow.trainer for backends

Signed-off-by: Saad Zaher
---
 kubeflow/trainer/backends/__init__.py | 25 -------------------------
 1 file changed, 25 deletions(-)

diff --git a/kubeflow/trainer/backends/__init__.py b/kubeflow/trainer/backends/__init__.py
index 3ccba2d53..e69de29bb 100644
--- a/kubeflow/trainer/backends/__init__.py
+++ b/kubeflow/trainer/backends/__init__.py
@@ -1,25 +0,0 @@
-# Copyright 2025 The Kubeflow Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from kubeflow.trainer.backends.kubernetes.backend import KubernetesBackend
-from kubeflow.trainer.backends.kubernetes.types import KubernetesBackendConfig
-from kubeflow.trainer.backends.localprocess.backend import LocalProcessBackend
-from kubeflow.trainer.backends.localprocess.types import LocalProcessBackendConfig
-
-__all__ = [
-    "KubernetesBackend",
-    "LocalProcessBackend",
-    "LocalProcessBackendConfig",
-    "KubernetesBackendConfig",
-]

From 511b22b376306fd0d15aea22550c4d65bf09fe8e Mon Sep 17 00:00:00 2001
From: Saad Zaher
Date: Sat, 13 Sep 2025 13:09:59 +0100
Subject: [PATCH 24/33] update local-exec to use only one step

While I believe in simplicity and dividing this into steps makes it easier
for debugging and extensibility.
Addressing comments on this PR consolidating all train job scripts into one and running it as single step to match k8s. Signed-off-by: Saad Zaher --- .../trainer/backends/localprocess/backend.py | 120 ++++-------------- .../backends/localprocess/constants.py | 27 +++- .../trainer/backends/localprocess/types.py | 2 - .../trainer/backends/localprocess/utils.py | 90 ++++++++++--- 4 files changed, 123 insertions(+), 116 deletions(-) diff --git a/kubeflow/trainer/backends/localprocess/backend.py b/kubeflow/trainer/backends/localprocess/backend.py index 57ad8cd3a..4a7afdc13 100644 --- a/kubeflow/trainer/backends/localprocess/backend.py +++ b/kubeflow/trainer/backends/localprocess/backend.py @@ -12,14 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. import logging -import os import string import tempfile import uuid -import venv import random from datetime import datetime -from pathlib import Path from typing import List, Optional, Set, Union, Iterator from kubeflow.trainer.constants import constants @@ -75,99 +72,47 @@ def train( if isinstance(trainer, types.CustomTrainer): trainer: types.CustomTrainer = trainer - # setup runtime - target_dir, python_bin, pip_bin = self.__setup_runtime(train_job_name=train_job_name) + # create temp dir + venv_dir = tempfile.mkdtemp(prefix=train_job_name) - logger.debug("operating in {}".format(target_dir)) + logger.debug("operating in {}".format(venv_dir)) local_runtime = self.__get_full_runtime(runtime) runtime.trainer = local_utils.get_runtime_trainer( - venv_dir=target_dir, - python_bin=str(python_bin), + venv_dir=venv_dir, framework=runtime.trainer.framework, ml_policy=local_runtime.ml_policy, ) training_command = [] - deps_command = [] if isinstance(trainer, types.CustomTrainer): if runtime.trainer.trainer_type != types.TrainerType.CUSTOM_TRAINER: raise ValueError(f"CustomTrainer can't be used with {runtime.name} runtime") - if trainer.packages_to_install: - 
deps_command = local_utils.get_dependencies_command( - python_bin=python_bin, - pip_bin=str(pip_bin), - pip_index_urls=trainer.pip_index_urls - if trainer.pip_index_urls - else constants.DEFAULT_PIP_INDEX_URLS, - packages=trainer.packages_to_install, - ) - training_command = local_utils.get_command_using_train_func( + training_command = local_utils.get_training_job_command( + trainer=trainer, runtime=runtime, - train_func=trainer.func, - train_func_parameters=trainer.func_args, - venv_dir=target_dir, train_job_name=train_job_name, + venv_dir=venv_dir, + cleanup=self.cfg.cleanup, ) - # make sure we wait for dependencies to be installed and runtime to become ready - training_dependencies = [] - # wait for all jobs to be completed then cleanup venv and other resources if needed. - cleanup_dependencies = [] - - if deps_command: - deps_job = LocalJob( - name="{}-deps".format(train_job_name), - command=deps_command, - execution_dir=target_dir, - env=trainer.env, - ) - deps_job.start() - # make sure training doesn't start before dependencies installation finish - training_dependencies.append(deps_job) - self.__register_job( - train_job_name=train_job_name, - step_name="deps", - job=deps_job, - runtime=local_runtime, - ) - - if training_command: - train_job = LocalJob( - name="{}-train".format(train_job_name), - command=training_command, - execution_dir=target_dir, - env=trainer.env, - dependencies=training_dependencies, - ) - train_job.start() - # ask cleanup job to wait for training to be completed. - cleanup_dependencies.append(train_job) - self.__register_job( - train_job_name=train_job_name, - step_name="train", - job=train_job, - runtime=local_runtime, - ) - - # if cleanup is requested. The virtualenv dir will be deleted. 
- if self.cfg.cleanup: - cleanup_command = local_utils.get_cleanup_command(venv_dir=target_dir) - cleanup_job = LocalJob( - name="{}-cleanup".format(train_job_name), - command=cleanup_command, - execution_dir=target_dir, - env=trainer.env, - dependencies=cleanup_dependencies, - ) - cleanup_job.start() - self.__register_job( - train_job_name=train_job_name, - step_name="cleanup", - job=cleanup_job, - runtime=local_runtime, - ) + else: + raise ValueError("Trainer type not supported") + train_job = LocalJob( + name="{}-train".format(train_job_name), + command=training_command, + execution_dir=venv_dir, + env=trainer.env, + dependencies=[], + ) + self.__register_job( + train_job_name=train_job_name, + step_name="train", + job=train_job, + runtime=local_runtime, + ) + train_job.start() return train_job_name @@ -260,22 +205,13 @@ def delete_job(self, name: str): # remove the job from the list of jobs self.__local_jobs.remove(_job) - def __setup_runtime(self, train_job_name): - target_dir = tempfile.mkdtemp(prefix=f"{train_job_name}-") - venv.create(env_dir=target_dir, with_pip=False) - - python_bin = Path(target_dir) / "bin" / "python" - if not os.path.exists(python_bin): - raise RuntimeError(f"Python executable not found at {python_bin}") - pip_bin = Path(target_dir) / "bin" / "pip" - - return target_dir, python_bin, pip_bin - def __get_full_runtime(self, runtime: types.Runtime): - target_runtime = [rt for rt in local_runtimes if rt.runtime.name == runtime.name] + target_runtime = next( + (rt for rt in local_runtimes if rt.runtime.name == runtime.name), None + ) if not target_runtime: raise ValueError(f"Runtime '{runtime.name}' not found.") - return target_runtime[0] + return target_runtime def __get_job_status(self, job: LocalBackendJobs) -> str: statuses = [_step.job.status for _step in job.steps] diff --git a/kubeflow/trainer/backends/localprocess/constants.py b/kubeflow/trainer/backends/localprocess/constants.py index edb75d155..6b677c3ca 100644 --- 
a/kubeflow/trainer/backends/localprocess/constants.py +++ b/kubeflow/trainer/backends/localprocess/constants.py @@ -51,19 +51,21 @@ ) ] + +# Create venv script + + # The exec script to embed training function into container command. DEPENDENCIES_SCRIPT = textwrap.dedent( """ - $PYTHON_BIN -m ensurepip --upgrade --default-pip - PIP_DISABLE_PIP_VERSION_CHECK=1 $PIP_BIN install --quiet \ - --no-warn-script-location $PIP_INDEX $PACKAGE_STR + PIP_DISABLE_PIP_VERSION_CHECK=1 pip install $QUIET \ + --no-warn-script-location $PIP_INDEX $PACKAGE_STR """ ) # activate virtualenv, then run the entrypoint from the virtualenv bin -LOCAL_EXEC_JOB_SCRIPT = textwrap.dedent( +LOCAL_EXEC_ENTRYPOINT = textwrap.dedent( """ - source $PYENV_LOCATION/bin/activate $ENTRYPOINT "$FUNC_FILE" "$PARAMETERS" """ ) @@ -79,3 +81,18 @@ rm -rf $PYENV_LOCATION """ ) + + +LOCAL_EXEC_JOB_TEMPLATE = textwrap.dedent( + """ + $OS_PYTHON_BIN -m venv --without-pip $PYENV_LOCATION + echo "Operating inside $PYENV_LOCATION" + source $PYENV_LOCATION/bin/activate + $PYENV_LOCATION/bin/python -m ensurepip --upgrade --default-pip + $DEPENDENCIES_SCRIPT + $ENTRYPOINT + $CLEANUP_SCRIPT + """ +) + +LOCAL_EXEC_FILENAME = "train_{}.py" diff --git a/kubeflow/trainer/backends/localprocess/types.py b/kubeflow/trainer/backends/localprocess/types.py index 40a825f5a..3315ac2f0 100644 --- a/kubeflow/trainer/backends/localprocess/types.py +++ b/kubeflow/trainer/backends/localprocess/types.py @@ -25,8 +25,6 @@ class LocalProcessBackendConfig(BaseModel): cleanup: bool = True - # change working directory to venv for execution. 
- run_in_venv_dir: bool = True class LocalRuntime(BaseModel): diff --git a/kubeflow/trainer/backends/localprocess/utils.py b/kubeflow/trainer/backends/localprocess/utils.py index fa5922576..e39a51383 100644 --- a/kubeflow/trainer/backends/localprocess/utils.py +++ b/kubeflow/trainer/backends/localprocess/utils.py @@ -1,5 +1,7 @@ import inspect import os +import shutil + import textwrap from pathlib import Path from string import Template @@ -8,12 +10,12 @@ from kubeflow_trainer_api import models from kubeflow.trainer.backends.localprocess import constants as local_exec_constants +from kubeflow.trainer.constants import constants from kubeflow.trainer.types import types def get_runtime_trainer( venv_dir: str, - python_bin: str, framework: str, ml_policy: models.TrainerV1alpha1MLPolicy, ) -> types.RuntimeTrainer: @@ -43,19 +45,22 @@ def get_runtime_trainer( return trainer -def get_dependencies_command(python_bin, pip_bin: str, pip_index_urls: str, packages: List[str]): +def get_dependencies_command(pip_index_urls: str, packages: List[str], quiet: bool = True) -> str: options = [f"--index-url {pip_index_urls[0]}"] options.extend(f"--extra-index-url {extra_index_url}" for extra_index_url in pip_index_urls[1:]) + """ + PIP_DISABLE_PIP_VERSION_CHECK=1 pip install $QUIET $AS_USER \ + --no-warn-script-location $PIP_INDEX $PACKAGE_STR + """ mapping = { - "PYTHON_BIN": python_bin, - "PIP_BIN": pip_bin, + "QUIET": "--quiet" if quiet else "", "PIP_INDEX": " ".join(options), "PACKAGE_STR": " ".join(packages), } t = Template(local_exec_constants.DEPENDENCIES_SCRIPT) result = t.substitute(**mapping) - return ("bash", "-c", result) + return result def get_command_using_train_func( @@ -64,7 +69,7 @@ def get_command_using_train_func( train_func_parameters: Optional[Dict[str, Any]], venv_dir: str, train_job_name: str, -) -> tuple: +) -> str: """ Get the Trainer container command from the given training function and parameters. 
""" @@ -82,9 +87,7 @@ def get_command_using_train_func( func_code = inspect.getsource(train_func) # Extract the file name where the function is defined and move it the venv directory. - func_file = Path(venv_dir) / "{}-{}".format( - train_job_name, os.path.basename(inspect.getfile(train_func)) - ) + func_file = Path(venv_dir) / local_exec_constants.LOCAL_EXEC_FILENAME.format(train_job_name) # Function might be defined in some indented scope (e.g. in another function). # We need to dedent the function code. @@ -104,21 +107,74 @@ def get_command_using_train_func( f.write(func_code) f.close() - t = Template(local_exec_constants.LOCAL_EXEC_JOB_SCRIPT) + t = Template(local_exec_constants.LOCAL_EXEC_ENTRYPOINT) mapping = { - "PARAMETERS": "", + "PARAMETERS": "", ## Torch Parameters if any "PYENV_LOCATION": venv_dir, "ENTRYPOINT": " ".join(runtime.trainer.command), "FUNC_FILE": func_file, } - command = t.safe_substitute(**mapping) + entrypoint = t.safe_substitute(**mapping) + + return entrypoint - return "bash", "-c", command +def get_cleanup_script(venv_dir: str, cleanup: bool = True) -> str: + script = "\n" + if not cleanup: + return script -def get_cleanup_command(venv_dir: str) -> tuple: - mapping = {"PYENV_LOCATION": venv_dir} t = Template(local_exec_constants.LOCAL_EXEC_JOB_CLEANUP_SCRIPT) - cleanup_command = t.substitute(**mapping) + mapping = { + "PYENV_LOCATION": venv_dir, + } + return t.substitute(**mapping) + + +def get_training_job_command( + train_job_name: str, + venv_dir: str, + trainer: types.CustomTrainer, + runtime: types.Runtime, + cleanup: bool = True, +) -> tuple: + # use local-exec train job template + t = Template(local_exec_constants.LOCAL_EXEC_JOB_TEMPLATE) + # find os python binary to create venv + python_bin = shutil.which("python") + if not python_bin: + python_bin = shutil.which("python3") + if not python_bin: + raise ValueError("No python executable found") + + # workout if dependencies needs to be installed + dependency_script = "\n" + if 
trainer.packages_to_install: + dependency_script = get_dependencies_command( + pip_index_urls=trainer.pip_index_urls + if trainer.pip_index_urls + else constants.DEFAULT_PIP_INDEX_URLS, + packages=trainer.packages_to_install, + quiet=False, + ) + + entrypoint = get_command_using_train_func( + venv_dir=venv_dir, + runtime=runtime, + train_func=trainer.func, + train_func_parameters=trainer.func_args, + train_job_name=train_job_name, + ) + + cleanup_script = get_cleanup_script(cleanup=cleanup, venv_dir=venv_dir) - return "bash", "-c", cleanup_command + mapping = { + "OS_PYTHON_BIN": python_bin, + "PYENV_LOCATION": venv_dir, + "DEPENDENCIES_SCRIPT": dependency_script, + "ENTRYPOINT": entrypoint, + "CLEANUP_SCRIPT": cleanup_script, + } + + command = t.safe_substitute(**mapping) + return "bash", "-c", command From 74d60a4ca56f8c1c9a0a3a610c9473afff587ed0 Mon Sep 17 00:00:00 2001 From: Saad Zaher Date: Sat, 13 Sep 2025 13:16:52 +0100 Subject: [PATCH 25/33] optimize loops when getting runtime Signed-off-by: Saad Zaher --- kubeflow/trainer/__init__.py | 4 ---- kubeflow/trainer/backends/localprocess/backend.py | 8 +++----- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/kubeflow/trainer/__init__.py b/kubeflow/trainer/__init__.py index 582eb71c6..346ef5077 100644 --- a/kubeflow/trainer/__init__.py +++ b/kubeflow/trainer/__init__.py @@ -39,9 +39,7 @@ ) # import backends and its associated configs -from kubeflow.trainer.backends.kubernetes.backend import KubernetesBackend from kubeflow.trainer.backends.kubernetes.types import KubernetesBackendConfig -from kubeflow.trainer.backends.localprocess.backend import LocalProcessBackend from kubeflow.trainer.backends.localprocess.types import LocalProcessBackendConfig @@ -62,8 +60,6 @@ "RuntimeTrainer", "TrainerClient", "TrainerType", - "KubernetesBackend", - "LocalProcessBackend", "LocalProcessBackendConfig", "KubernetesBackendConfig", ] diff --git a/kubeflow/trainer/backends/localprocess/backend.py 
b/kubeflow/trainer/backends/localprocess/backend.py index 4a7afdc13..1c0e4b4ca 100644 --- a/kubeflow/trainer/backends/localprocess/backend.py +++ b/kubeflow/trainer/backends/localprocess/backend.py @@ -48,11 +48,11 @@ def list_runtimes(self) -> List[types.Runtime]: return [local_runtime.runtime for local_runtime in local_runtimes] def get_runtime(self, name: str) -> Optional[types.Runtime]: - _runtime = [rt.runtime for rt in local_runtimes if rt.runtime.name == name] + _runtime = next((rt.runtime for rt in local_runtimes if rt.runtime.name == name), None) if not _runtime: raise ValueError(f"Runtime '{name}' not found.") - return _runtime[0] + return _runtime def get_runtime_packages(self, runtime: types.Runtime): raise NotImplementedError("get_runtime_packages is not supported by LocalProcessBackend") @@ -63,9 +63,7 @@ def train( initializer: Optional[types.Initializer] = None, trainer: Optional[Union[types.CustomTrainer, types.BuiltinTrainer]] = None, ) -> str: - train_job_name = "kft-{}".format( - random.choice(string.ascii_lowercase) + uuid.uuid4().hex[:11], - ) + train_job_name = random.choice(string.ascii_lowercase) + uuid.uuid4().hex[:11] # Build the env if not trainer: raise ValueError("Cannot create TrainJob without a Trainer") From 9d9a14ce2cad5b768e99be056e6656d8a2cde392 Mon Sep 17 00:00:00 2001 From: Saad Zaher Date: Sun, 14 Sep 2025 12:27:07 +0100 Subject: [PATCH 26/33] add LocalRuntimeTrainer Signed-off-by: Saad Zaher --- .../trainer/backends/localprocess/backend.py | 41 ++--- .../backends/localprocess/constants.py | 42 ++--- .../trainer/backends/localprocess/types.py | 8 +- .../trainer/backends/localprocess/utils.py | 146 ++++++++++++++++-- 4 files changed, 172 insertions(+), 65 deletions(-) diff --git a/kubeflow/trainer/backends/localprocess/backend.py b/kubeflow/trainer/backends/localprocess/backend.py index 1c0e4b4ca..52f66a4c0 100644 --- a/kubeflow/trainer/backends/localprocess/backend.py +++ b/kubeflow/trainer/backends/localprocess/backend.py @@ 
-26,7 +26,7 @@ LocalProcessBackendConfig, LocalBackendJobs, LocalBackendStep, - LocalRuntime, + LocalRuntimeTrainer, ) from kubeflow.trainer.backends.localprocess.constants import local_runtimes from kubeflow.trainer.backends.localprocess.job import LocalJob @@ -45,17 +45,21 @@ def __init__( self.cfg = cfg def list_runtimes(self) -> List[types.Runtime]: - return [local_runtime.runtime for local_runtime in local_runtimes] + return local_runtimes def get_runtime(self, name: str) -> Optional[types.Runtime]: - _runtime = next((rt.runtime for rt in local_runtimes if rt.runtime.name == name), None) - if not _runtime: + runtime = next((rt for rt in local_runtimes if rt.name == name), None) + if not runtime: raise ValueError(f"Runtime '{name}' not found.") - return _runtime + return runtime def get_runtime_packages(self, runtime: types.Runtime): - raise NotImplementedError("get_runtime_packages is not supported by LocalProcessBackend") + if isinstance(runtime.trainer, LocalRuntimeTrainer): + return runtime.trainer.packages + else: + logger.debug("Trainer type isn't supported by LocalProcessBackend") + return [] def train( self, @@ -75,16 +79,12 @@ def train( logger.debug("operating in {}".format(venv_dir)) - local_runtime = self.__get_full_runtime(runtime) - runtime.trainer = local_utils.get_runtime_trainer( + runtime_name=runtime.name, venv_dir=venv_dir, framework=runtime.trainer.framework, - ml_policy=local_runtime.ml_policy, ) - training_command = [] - if isinstance(trainer, types.CustomTrainer): if runtime.trainer.trainer_type != types.TrainerType.CUSTOM_TRAINER: raise ValueError(f"CustomTrainer can't be used with {runtime.name} runtime") @@ -97,6 +97,7 @@ def train( ) else: raise ValueError("Trainer type not supported") + train_job = LocalJob( name="{}-train".format(train_job_name), command=training_command, @@ -108,7 +109,7 @@ def train( train_job_name=train_job_name, step_name="train", job=train_job, - runtime=local_runtime, + runtime=runtime, ) train_job.start() @@ 
-118,7 +119,7 @@ def list_jobs(self, runtime: Optional[types.Runtime] = None) -> List[types.Train result = [] for _job in self.__local_jobs: - if runtime and _job.runtime.runtime.name != runtime.name: + if runtime and _job.runtime.name != runtime.name: continue result.append( types.TrainJob( @@ -149,7 +150,7 @@ def get_job(self, name: str) -> Optional[types.TrainJob]: types.Step(name=_step.step_name, pod_name=_step.step_name, status=_step.job.status) for _step in _job.steps ], - runtime=_job.runtime.runtime, + runtime=_job.runtime, num_nodes=1, status=status, ) @@ -188,7 +189,7 @@ def wait_for_job_status( raise ValueError("No TrainJob with name '%s'" % name) # find a better implementation for this for _step in _job.steps: - if _step.status in [constants.TRAINJOB_RUNNING, constants.TRAINJOB_CREATED]: + if _step.job.status in [constants.TRAINJOB_RUNNING, constants.TRAINJOB_CREATED]: _step.job.join(timeout=timeout) return self.get_job(name) @@ -203,14 +204,6 @@ def delete_job(self, name: str): # remove the job from the list of jobs self.__local_jobs.remove(_job) - def __get_full_runtime(self, runtime: types.Runtime): - target_runtime = next( - (rt for rt in local_runtimes if rt.runtime.name == runtime.name), None - ) - if not target_runtime: - raise ValueError(f"Runtime '{runtime.name}' not found.") - return target_runtime - def __get_job_status(self, job: LocalBackendJobs) -> str: statuses = [_step.job.status for _step in job.steps] # if status is running or failed will take precedence over completed @@ -230,7 +223,7 @@ def __register_job( train_job_name: str, step_name: str, job: LocalJob, - runtime: LocalRuntime = None, + runtime: types.Runtime = None, ): _job = [j for j in self.__local_jobs if j.name == train_job_name] if not _job: diff --git a/kubeflow/trainer/backends/localprocess/constants.py b/kubeflow/trainer/backends/localprocess/constants.py index 6b677c3ca..7ac0868fe 100644 --- a/kubeflow/trainer/backends/localprocess/constants.py +++ 
b/kubeflow/trainer/backends/localprocess/constants.py @@ -13,40 +13,23 @@ # limitations under the License. import textwrap -from kubeflow_trainer_api.models.trainer_v1alpha1_ml_policy import TrainerV1alpha1MLPolicy -from kubeflow_trainer_api.models.trainer_v1alpha1_torch_ml_policy_source import ( - TrainerV1alpha1TorchMLPolicySource, -) -from kubeflow_trainer_api.models.trainer_v1alpha1_torch_elastic_policy import ( - TrainerV1alpha1TorchElasticPolicy, -) -from kubeflow_trainer_api.models.io_k8s_apimachinery_pkg_util_intstr_int_or_string import ( - IoK8sApimachineryPkgUtilIntstrIntOrString, -) +import re from kubeflow.trainer.types import types as base_types from kubeflow.trainer.constants import constants from kubeflow.trainer.backends.localprocess import types +TORCH_FRAMEWORK_TYPE = "torch" local_runtimes = [ - types.LocalRuntime( - runtime=base_types.Runtime( - name=constants.TORCH_RUNTIME, - trainer=base_types.RuntimeTrainer( - trainer_type=base_types.TrainerType.CUSTOM_TRAINER, - framework="torch", - num_nodes=1, - device=constants.UNKNOWN, - device_count=constants.UNKNOWN, - ), - ), - ml_policy=TrainerV1alpha1MLPolicy( - torch=TrainerV1alpha1TorchMLPolicySource( - elasticPolicy=TrainerV1alpha1TorchElasticPolicy( - maxNodes=1, minNodes=1, maxRestarts=1 - ), - numProcPerNode=IoK8sApimachineryPkgUtilIntstrIntOrString(1), - ) + base_types.Runtime( + name=constants.TORCH_RUNTIME, + trainer=types.LocalRuntimeTrainer( + trainer_type=base_types.TrainerType.CUSTOM_TRAINER, + framework=TORCH_FRAMEWORK_TYPE, + num_nodes=1, + device_count=constants.UNKNOWN, + device=constants.UNKNOWN, + packages=["torch"], ), ) ] @@ -85,6 +68,7 @@ LOCAL_EXEC_JOB_TEMPLATE = textwrap.dedent( """ + set -e $OS_PYTHON_BIN -m venv --without-pip $PYENV_LOCATION echo "Operating inside $PYENV_LOCATION" source $PYENV_LOCATION/bin/activate @@ -96,3 +80,5 @@ ) LOCAL_EXEC_FILENAME = "train_{}.py" + +PYTHON_PACKAGE_NAME_RE = re.compile(r"^\s*([A-Za-z0-9][A-Za-z0-9._-]*)") diff --git 
a/kubeflow/trainer/backends/localprocess/types.py b/kubeflow/trainer/backends/localprocess/types.py index 3315ac2f0..7e91122a1 100644 --- a/kubeflow/trainer/backends/localprocess/types.py +++ b/kubeflow/trainer/backends/localprocess/types.py @@ -13,6 +13,7 @@ # limitations under the License. import typing +from dataclasses import dataclass, field from datetime import datetime from typing import List, Optional @@ -27,6 +28,11 @@ class LocalProcessBackendConfig(BaseModel): cleanup: bool = True +@dataclass +class LocalRuntimeTrainer(types.RuntimeTrainer): + packages: List[str] = field(default_factory=list) + + class LocalRuntime(BaseModel): runtime: types.Runtime = None ml_policy: trainer_v1alpha1_ml_policy.TrainerV1alpha1MLPolicy @@ -45,7 +51,7 @@ class Config: class LocalBackendJobs(BaseModel): steps: Optional[List[LocalBackendStep]] = [] - runtime: Optional[LocalRuntime] = None + runtime: Optional[types.Runtime] = None name: str created: typing.Optional[datetime] = None completed: typing.Optional[datetime] = None diff --git a/kubeflow/trainer/backends/localprocess/utils.py b/kubeflow/trainer/backends/localprocess/utils.py index e39a51383..47d7e96f8 100644 --- a/kubeflow/trainer/backends/localprocess/utils.py +++ b/kubeflow/trainer/backends/localprocess/utils.py @@ -1,42 +1,147 @@ import inspect import os import shutil - +import re import textwrap from pathlib import Path from string import Template -from typing import List, Callable, Optional, Dict, Any - -from kubeflow_trainer_api import models +from typing import List, Callable, Optional, Dict, Any, Tuple, Set from kubeflow.trainer.backends.localprocess import constants as local_exec_constants from kubeflow.trainer.constants import constants from kubeflow.trainer.types import types +from kubeflow.trainer.backends.localprocess.types import LocalRuntimeTrainer + + +def _extract_name(requirement: str) -> str: + """ + Extract the base distribution name from a requirement string without external deps. 
+ + Supports common PEP 508 patterns: + - 'package' + - 'package[extra1,extra2]' + - 'package==1.2.3', 'package>=1.0', 'package~=1.4', etc. + - 'package @ https://...' + - markers after ';' are irrelevant for name extraction. + + Returns the *raw* (un-normalized) name as it appears. + Raises ValueError if a name cannot be parsed. + """ + if requirement is None: + raise ValueError("Requirement string cannot be None") + s = requirement.strip() + if not s: + raise ValueError("Empty requirement string") + + m = local_exec_constants.PYTHON_PACKAGE_NAME_RE.match(s) + if not m: + raise ValueError(f"Could not parse package name from requirement: {requirement!r}") + return m.group(1) + + +def _canonicalize_name(name: str) -> str: + """ + PEP 503-style normalization: case-insensitive, and collapse runs of -, _, . into '-'. + """ + return re.sub(r"[-_.]+", "-", name).lower() + + +def get_install_packages( + runtime_packages: List[str], + trainer_packages: Optional[List[str]] = None, +) -> List[str]: + """ + Merge two requirement lists into a single list of strings. + + Rules implemented: + 1) If a package appears in trainer_packages, it overwrites the one in runtime_packages. + We keep the *trainer string verbatim* (specifier, markers, extras, spacing). + 2) Case-insensitive matching of package names (PEP 503-style normalization). + 3) Output is a list of strings. + 4) If trainer_packages contains the same dependency multiple times (case-insensitive), + raise ValueError. + 5) If runtime_packages contains duplicates, the last one among *runtime* wins there + (no error), but any trainer entry still overwrites it. Runtime packages shouldn't + have any duplicates. + 6) Ordering: keep runtime-only packages in their original order (emitting only their + last occurrence), then append all trainer packages in their original order. 
+ """ + if not trainer_packages: + return runtime_packages + + # --- Parse + normalize runtime --- + runtime_parsed: List[Tuple[str, str]] = [] # (orig, canonical_name) + last_runtime_index_by_name: Dict[str, int] = {} + + for i, orig in enumerate(runtime_packages): + raw_name = _extract_name(orig) + canon = _canonicalize_name(raw_name) + runtime_parsed.append((orig, canon)) + last_runtime_index_by_name[canon] = i # last occurrence index wins among runtime + + # --- Parse + validate trainer (detect duplicates) --- + trainer_parsed: List[Tuple[str, str]] = [] + seen_trainer: Set[str] = set() + for orig in trainer_packages: + raw_name = _extract_name(orig) + canon = _canonicalize_name(raw_name) + if canon in seen_trainer: + raise ValueError( + f"Duplicate dependency in trainer_packages: '{raw_name}' (canonical: '{canon}')" + ) + seen_trainer.add(canon) + trainer_parsed.append((orig, canon)) + + trainer_names: Set[str] = {canon for _, canon in trainer_parsed} + + # --- Build merged list respecting order semantics --- + merged: List[str] = [] + + # 1) Runtime-only packages (only emit the last occurrence for each name) + emitted_runtime_names: Set[str] = set() + for idx, (orig, canon) in enumerate(runtime_parsed): + if canon in trainer_names: + continue # overwritten by trainer + if last_runtime_index_by_name[canon] == idx and canon not in emitted_runtime_names: + merged.append(orig) + emitted_runtime_names.add(canon) + + # 2) Trainer packages (overwrite and preserve trainer's exact strings, original order) + for orig, _ in trainer_parsed: + merged.append(orig) + + return merged def get_runtime_trainer( + runtime_name: str, venv_dir: str, framework: str, - ml_policy: models.TrainerV1alpha1MLPolicy, -) -> types.RuntimeTrainer: +) -> LocalRuntimeTrainer: """ - Get the RuntimeTrainer object. + Get the LocalRuntimeTrainer object. 
""" + local_runtime = next( + (rt for rt in local_exec_constants.local_runtimes if rt.name == runtime_name), None + ) + if not local_runtime: + raise ValueError(f"Runtime {runtime_name} not found") - trainer = types.RuntimeTrainer( + trainer = LocalRuntimeTrainer( trainer_type=( types.TrainerType.BUILTIN_TRAINER if framework == types.TORCH_TUNE else types.TrainerType.CUSTOM_TRAINER ), framework=framework, + packages=local_runtime.trainer.packages, ) # set command to run from venv venv_bin_dir = str(Path(venv_dir) / "bin") default_cmd = [str(Path(venv_bin_dir) / local_exec_constants.DEFAULT_COMMAND)] # Set the Trainer entrypoint. - if ml_policy.torch: + if framework == local_exec_constants.TORCH_FRAMEWORK_TYPE: _c = [os.path.join(venv_bin_dir, local_exec_constants.TORCH_COMMAND)] trainer.set_command(tuple(_c)) else: @@ -45,7 +150,18 @@ def get_runtime_trainer( return trainer -def get_dependencies_command(pip_index_urls: str, packages: List[str], quiet: bool = True) -> str: +def get_dependencies_command( + runtime_packages: List[str], + pip_index_urls: str, + trainer_packages: List[str], + quiet: bool = True, +) -> str: + # resolve runtime dependencies and trainer dependencies. 
+ packages = get_install_packages( + runtime_packages=runtime_packages, + trainer_packages=trainer_packages, + ) + options = [f"--index-url {pip_index_urls[0]}"] options.extend(f"--extra-index-url {extra_index_url}" for extra_index_url in pip_index_urls[1:]) @@ -56,7 +172,7 @@ def get_dependencies_command(pip_index_urls: str, packages: List[str], quiet: bo mapping = { "QUIET": "--quiet" if quiet else "", "PIP_INDEX": " ".join(options), - "PACKAGE_STR": " ".join(packages), + "PACKAGE_STR": '"{}"'.format('" "'.join(packages)), # quote deps } t = Template(local_exec_constants.DEPENDENCIES_SCRIPT) result = t.substitute(**mapping) @@ -148,13 +264,18 @@ def get_training_job_command( raise ValueError("No python executable found") # workout if dependencies needs to be installed + if isinstance(runtime.trainer, LocalRuntimeTrainer): + runtime_trainer: LocalRuntimeTrainer = runtime.trainer + else: + raise ValueError("Invalid Runtime Trainer type: {type(runtime.trainer)}") dependency_script = "\n" if trainer.packages_to_install: dependency_script = get_dependencies_command( pip_index_urls=trainer.pip_index_urls if trainer.pip_index_urls else constants.DEFAULT_PIP_INDEX_URLS, - packages=trainer.packages_to_install, + runtime_packages=runtime_trainer.packages, + trainer_packages=trainer.packages_to_install, quiet=False, ) @@ -177,4 +298,5 @@ def get_training_job_command( } command = t.safe_substitute(**mapping) + return "bash", "-c", command From 60d96d00eb18018dc69dc4aca827a4f11d256052 Mon Sep 17 00:00:00 2001 From: Saad Zaher Date: Sun, 14 Sep 2025 12:54:13 +0100 Subject: [PATCH 27/33] rename cleanup config item to cleanup_venv Signed-off-by: Saad Zaher --- .../trainer/backends/localprocess/backend.py | 35 +++++++++---------- .../trainer/backends/localprocess/types.py | 2 +- 2 files changed, 17 insertions(+), 20 deletions(-) diff --git a/kubeflow/trainer/backends/localprocess/backend.py b/kubeflow/trainer/backends/localprocess/backend.py index 52f66a4c0..3b08fe988 100644 --- 
a/kubeflow/trainer/backends/localprocess/backend.py +++ b/kubeflow/trainer/backends/localprocess/backend.py @@ -47,7 +47,7 @@ def __init__( def list_runtimes(self) -> List[types.Runtime]: return local_runtimes - def get_runtime(self, name: str) -> Optional[types.Runtime]: + def get_runtime(self, name: str) -> types.Runtime: runtime = next((rt for rt in local_runtimes if rt.name == name), None) if not runtime: raise ValueError(f"Runtime '{name}' not found.") @@ -67,16 +67,14 @@ def train( initializer: Optional[types.Initializer] = None, trainer: Optional[Union[types.CustomTrainer, types.BuiltinTrainer]] = None, ) -> str: + # set train job name train_job_name = random.choice(string.ascii_lowercase) + uuid.uuid4().hex[:11] - # Build the env - if not trainer: - raise ValueError("Cannot create TrainJob without a Trainer") - if isinstance(trainer, types.CustomTrainer): - trainer: types.CustomTrainer = trainer + # localprocess backend only supports CustomTrainer + if not isinstance(trainer, types.CustomTrainer): + raise ValueError("CustomTrainer must be set with LocalProcessBackend") # create temp dir venv_dir = tempfile.mkdtemp(prefix=train_job_name) - logger.debug("operating in {}".format(venv_dir)) runtime.trainer = local_utils.get_runtime_trainer( @@ -85,19 +83,16 @@ def train( framework=runtime.trainer.framework, ) - if isinstance(trainer, types.CustomTrainer): - if runtime.trainer.trainer_type != types.TrainerType.CUSTOM_TRAINER: - raise ValueError(f"CustomTrainer can't be used with {runtime.name} runtime") - training_command = local_utils.get_training_job_command( - trainer=trainer, - runtime=runtime, - train_job_name=train_job_name, - venv_dir=venv_dir, - cleanup=self.cfg.cleanup, - ) - else: - raise ValueError("Trainer type not supported") + # build training job command + training_command = local_utils.get_training_job_command( + trainer=trainer, + runtime=runtime, + train_job_name=train_job_name, + venv_dir=venv_dir, + cleanup=self.cfg.cleanup_venv, + ) + # 
create subprocess object train_job = LocalJob( name="{}-train".format(train_job_name), command=training_command, @@ -105,12 +100,14 @@ def train( env=trainer.env, dependencies=[], ) + self.__register_job( train_job_name=train_job_name, step_name="train", job=train_job, runtime=runtime, ) + # start the job. train_job.start() return train_job_name diff --git a/kubeflow/trainer/backends/localprocess/types.py b/kubeflow/trainer/backends/localprocess/types.py index 7e91122a1..b77fe4ec7 100644 --- a/kubeflow/trainer/backends/localprocess/types.py +++ b/kubeflow/trainer/backends/localprocess/types.py @@ -25,7 +25,7 @@ class LocalProcessBackendConfig(BaseModel): - cleanup: bool = True + cleanup_venv: bool = True @dataclass From 8e9190e466de441451d3cb9b659097fd4c0aa12c Mon Sep 17 00:00:00 2001 From: Saad Zaher Date: Sun, 14 Sep 2025 16:27:12 +0100 Subject: [PATCH 28/33] convert local runtime to runtime Signed-off-by: Saad Zaher --- .../trainer/backends/localprocess/backend.py | 24 +++++++++++++++++-- .../trainer/backends/localprocess/types.py | 9 ------- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/kubeflow/trainer/backends/localprocess/backend.py b/kubeflow/trainer/backends/localprocess/backend.py index 3b08fe988..29d059f23 100644 --- a/kubeflow/trainer/backends/localprocess/backend.py +++ b/kubeflow/trainer/backends/localprocess/backend.py @@ -45,10 +45,17 @@ def __init__( self.cfg = cfg def list_runtimes(self) -> List[types.Runtime]: - return local_runtimes + return [self.__convert_local_runtime_to_runtime(local_runtime=rt) for rt in local_runtimes] def get_runtime(self, name: str) -> types.Runtime: - runtime = next((rt for rt in local_runtimes if rt.name == name), None) + runtime = next( + ( + self.__convert_local_runtime_to_runtime(rt) + for rt in local_runtimes + if rt.name == name + ), + None, + ) if not runtime: raise ValueError(f"Runtime '{name}' not found.") @@ -234,3 +241,16 @@ def __register_job( _job.steps.append(_step) else: 
logger.warning("Step '{}' already registered.".format(step_name)) + + def __convert_local_runtime_to_runtime(self, local_runtime) -> types.Runtime: + return types.Runtime( + name=local_runtime.name, + trainer=types.RuntimeTrainer( + trainer_type=local_runtime.trainer.trainer_type, + framework=local_runtime.trainer.framework, + num_nodes=local_runtime.trainer.num_nodes, + device_count=local_runtime.trainer.device_count, + device=local_runtime.trainer.device, + ), + pretrained_model=local_runtime.pretrained_model, + ) diff --git a/kubeflow/trainer/backends/localprocess/types.py b/kubeflow/trainer/backends/localprocess/types.py index b77fe4ec7..d2ee0145b 100644 --- a/kubeflow/trainer/backends/localprocess/types.py +++ b/kubeflow/trainer/backends/localprocess/types.py @@ -18,7 +18,6 @@ from typing import List, Optional from pydantic import BaseModel -from kubeflow_trainer_api.models import trainer_v1alpha1_ml_policy from kubeflow.trainer.backends.localprocess.job import LocalJob from kubeflow.trainer.types import types @@ -33,14 +32,6 @@ class LocalRuntimeTrainer(types.RuntimeTrainer): packages: List[str] = field(default_factory=list) -class LocalRuntime(BaseModel): - runtime: types.Runtime = None - ml_policy: trainer_v1alpha1_ml_policy.TrainerV1alpha1MLPolicy - - class Config: - arbitrary_types_allowed = True - - class LocalBackendStep(BaseModel): step_name: str job: LocalJob From ac0be0c1bc2302a4103c4f90aa5ec83864a42c78 Mon Sep 17 00:00:00 2001 From: Saad Zaher Date: Sun, 14 Sep 2025 16:39:02 +0100 Subject: [PATCH 29/33] convert runtimes before returning Signed-off-by: Saad Zaher --- kubeflow/trainer/backends/localprocess/backend.py | 13 ++++++------- kubeflow/trainer/backends/localprocess/utils.py | 14 +++++--------- 2 files changed, 11 insertions(+), 16 deletions(-) diff --git a/kubeflow/trainer/backends/localprocess/backend.py b/kubeflow/trainer/backends/localprocess/backend.py index 29d059f23..81c6cc12b 100644 --- 
a/kubeflow/trainer/backends/localprocess/backend.py +++ b/kubeflow/trainer/backends/localprocess/backend.py @@ -26,7 +26,6 @@ LocalProcessBackendConfig, LocalBackendJobs, LocalBackendStep, - LocalRuntimeTrainer, ) from kubeflow.trainer.backends.localprocess.constants import local_runtimes from kubeflow.trainer.backends.localprocess.job import LocalJob @@ -62,11 +61,11 @@ def get_runtime(self, name: str) -> types.Runtime: return runtime def get_runtime_packages(self, runtime: types.Runtime): - if isinstance(runtime.trainer, LocalRuntimeTrainer): - return runtime.trainer.packages - else: - logger.debug("Trainer type isn't supported by LocalProcessBackend") - return [] + runtime = next((rt for rt in local_runtimes if rt.name == runtime.name), None) + if not runtime: + raise ValueError(f"Runtime '{runtime.name}' not found.") + + return runtime.trainer.packages def train( self, @@ -96,7 +95,7 @@ def train( runtime=runtime, train_job_name=train_job_name, venv_dir=venv_dir, - cleanup=self.cfg.cleanup_venv, + cleanup_venv=self.cfg.cleanup_venv, ) # create subprocess object diff --git a/kubeflow/trainer/backends/localprocess/utils.py b/kubeflow/trainer/backends/localprocess/utils.py index 47d7e96f8..e60e0d98c 100644 --- a/kubeflow/trainer/backends/localprocess/utils.py +++ b/kubeflow/trainer/backends/localprocess/utils.py @@ -128,11 +128,7 @@ def get_runtime_trainer( raise ValueError(f"Runtime {runtime_name} not found") trainer = LocalRuntimeTrainer( - trainer_type=( - types.TrainerType.BUILTIN_TRAINER - if framework == types.TORCH_TUNE - else types.TrainerType.CUSTOM_TRAINER - ), + trainer_type=types.TrainerType.CUSTOM_TRAINER, framework=framework, packages=local_runtime.trainer.packages, ) @@ -235,9 +231,9 @@ def get_command_using_train_func( return entrypoint -def get_cleanup_script(venv_dir: str, cleanup: bool = True) -> str: +def get_cleanup_venv_script(venv_dir: str, cleanup_venv: bool = True) -> str: script = "\n" - if not cleanup: + if not cleanup_venv: return 
script t = Template(local_exec_constants.LOCAL_EXEC_JOB_CLEANUP_SCRIPT) @@ -252,7 +248,7 @@ def get_training_job_command( venv_dir: str, trainer: types.CustomTrainer, runtime: types.Runtime, - cleanup: bool = True, + cleanup_venv: bool = True, ) -> tuple: # use local-exec train job template t = Template(local_exec_constants.LOCAL_EXEC_JOB_TEMPLATE) @@ -287,7 +283,7 @@ def get_training_job_command( train_job_name=train_job_name, ) - cleanup_script = get_cleanup_script(cleanup=cleanup, venv_dir=venv_dir) + cleanup_script = get_cleanup_venv_script(cleanup_venv=cleanup_venv, venv_dir=venv_dir) mapping = { "OS_PYTHON_BIN": python_bin, From 4b1db8cab3350b5e0efb56e18f96b06240c3fa37 Mon Sep 17 00:00:00 2001 From: Saad Zaher Date: Sun, 14 Sep 2025 16:42:54 +0100 Subject: [PATCH 30/33] fix get_job_logs to align with parent interface Signed-off-by: Saad Zaher --- kubeflow/trainer/backends/localprocess/backend.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kubeflow/trainer/backends/localprocess/backend.py b/kubeflow/trainer/backends/localprocess/backend.py index 81c6cc12b..611c8b333 100644 --- a/kubeflow/trainer/backends/localprocess/backend.py +++ b/kubeflow/trainer/backends/localprocess/backend.py @@ -161,9 +161,8 @@ def get_job(self, name: str) -> Optional[types.TrainJob]: def get_job_logs( self, name: str, - follow: Optional[bool] = False, step: str = constants.NODE + "-0", - node_rank: int = 0, + follow: Optional[bool] = False, ) -> Iterator[str]: _job = [j for j in self.__local_jobs if j.name == name] if not _job: From 4fe7baa09bb03d9e9116547ead3b6cc77de02202 Mon Sep 17 00:00:00 2001 From: Saad Zaher Date: Sun, 14 Sep 2025 18:21:44 +0100 Subject: [PATCH 31/33] rename get_runtime_trainer func Signed-off-by: Saad Zaher --- kubeflow/trainer/backends/localprocess/backend.py | 5 ++++- kubeflow/trainer/backends/localprocess/utils.py | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/kubeflow/trainer/backends/localprocess/backend.py 
b/kubeflow/trainer/backends/localprocess/backend.py index 611c8b333..bda23680d 100644 --- a/kubeflow/trainer/backends/localprocess/backend.py +++ b/kubeflow/trainer/backends/localprocess/backend.py @@ -83,7 +83,7 @@ def train( venv_dir = tempfile.mkdtemp(prefix=train_job_name) logger.debug("operating in {}".format(venv_dir)) - runtime.trainer = local_utils.get_runtime_trainer( + runtime.trainer = local_utils.get_local_runtime_trainer( runtime_name=runtime.name, venv_dir=venv_dir, framework=runtime.trainer.framework, @@ -98,6 +98,9 @@ def train( cleanup_venv=self.cfg.cleanup_venv, ) + # set the command in the runtime trainer + runtime.trainer.set_command(training_command) + # create subprocess object train_job = LocalJob( name="{}-train".format(train_job_name), diff --git a/kubeflow/trainer/backends/localprocess/utils.py b/kubeflow/trainer/backends/localprocess/utils.py index e60e0d98c..5bc345a05 100644 --- a/kubeflow/trainer/backends/localprocess/utils.py +++ b/kubeflow/trainer/backends/localprocess/utils.py @@ -113,7 +113,7 @@ def get_install_packages( return merged -def get_runtime_trainer( +def get_local_runtime_trainer( runtime_name: str, venv_dir: str, framework: str, From 9775f3fbcf2609ef55ff94b3fdd366573a3a6115 Mon Sep 17 00:00:00 2001 From: Saad Zaher Date: Sun, 14 Sep 2025 18:32:50 +0100 Subject: [PATCH 32/33] rename get_training_job_command to get_local_train_job_script Signed-off-by: Saad Zaher --- kubeflow/trainer/backends/localprocess/backend.py | 2 +- kubeflow/trainer/backends/localprocess/utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kubeflow/trainer/backends/localprocess/backend.py b/kubeflow/trainer/backends/localprocess/backend.py index bda23680d..4fbb2d465 100644 --- a/kubeflow/trainer/backends/localprocess/backend.py +++ b/kubeflow/trainer/backends/localprocess/backend.py @@ -90,7 +90,7 @@ def train( ) # build training job command - training_command = local_utils.get_training_job_command( + training_command = 
local_utils.get_local_train_job_script( trainer=trainer, runtime=runtime, train_job_name=train_job_name, diff --git a/kubeflow/trainer/backends/localprocess/utils.py b/kubeflow/trainer/backends/localprocess/utils.py index 5bc345a05..c86e6f816 100644 --- a/kubeflow/trainer/backends/localprocess/utils.py +++ b/kubeflow/trainer/backends/localprocess/utils.py @@ -243,7 +243,7 @@ def get_cleanup_venv_script(venv_dir: str, cleanup_venv: bool = True) -> str: return t.substitute(**mapping) -def get_training_job_command( +def get_local_train_job_script( train_job_name: str, venv_dir: str, trainer: types.CustomTrainer, From 42c77696c0f87618b7806dc9aec8a32153c9a77e Mon Sep 17 00:00:00 2001 From: Andrey Velichkevich Date: Mon, 15 Sep 2025 01:40:55 +0100 Subject: [PATCH 33/33] Ignore failures in Coveralls action Signed-off-by: Andrey Velichkevich --- .github/workflows/test-python.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test-python.yaml b/.github/workflows/test-python.yaml index 850bb314f..9023f5ffd 100644 --- a/.github/workflows/test-python.yaml +++ b/.github/workflows/test-python.yaml @@ -15,7 +15,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: ['3.9', '3.11'] + python-version: ["3.9", "3.11"] name: Test (Python ${{ matrix.python-version }}) @@ -36,6 +36,7 @@ jobs: - name: Upload coverage to Coveralls uses: coverallsapp/github-action@v2 + continue-on-error: true with: github-token: ${{ secrets.GITHUB_TOKEN }} parallel: true @@ -48,6 +49,7 @@ jobs: steps: - name: Close parallel build uses: coverallsapp/github-action@v2 + continue-on-error: true with: github-token: ${{ secrets.GITHUB_TOKEN }} parallel-finished: true