Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 39 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,45 @@ TrainerClient().wait_for_job_status(job_id)
print("\n".join(TrainerClient().get_job_logs(name=job_id)))
```

## Local Development
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we also include docs about Trainer local execution in the user guides?
https://www.kubeflow.org/docs/components/trainer/user-guides/
you can also add info from the @szaher PR: #95

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

WIP PR for this kubeflow/website#4221


Kubeflow SDK provides first-class support for local development, allowing you to test and iterate on your models without needing a Kubernetes cluster.

### Execution Backends

Choose the backend that fits your development workflow:

| Backend | Description | Use Case |
|---------|-------------|----------|
| **KubernetesBackend** | Run jobs on a Kubernetes cluster | Production, multi-node distributed training |
| **ContainerBackend** | Auto-detects Docker or Podman | Local development with container isolation |
| **LocalProcessBackend** | Run as local Python subprocesses | Quick prototyping, debugging |

### Local Container Execution

The **ContainerBackend** automatically detects and uses either Docker or Podman:

```bash
# Install with Docker support
pip install kubeflow[docker]

# Or install with Podman support
pip install kubeflow[podman]
```

```python
from kubeflow.trainer import TrainerClient, ContainerBackendConfig, CustomTrainer

# Auto-detects Docker or Podman
config = ContainerBackendConfig()
client = TrainerClient(backend_config=config)

# Your training runs in isolated containers
job_id = client.train(trainer=CustomTrainer(func=train_fn))
```

For detailed configuration options and platform-specific setup (macOS, Linux), see the [ContainerBackend documentation](kubeflow/trainer/backends/container/README.md).

## Supported Kubeflow Projects

| Project | Status | Version Support | Description |
Expand Down
2 changes: 2 additions & 0 deletions kubeflow/trainer/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

# Import the Kubeflow Trainer client.
from kubeflow.trainer.api.trainer_client import TrainerClient # noqa: F401
from kubeflow.trainer.backends.container.types import ContainerBackendConfig

# Import backends and their associated configs.
from kubeflow.trainer.backends.kubernetes.types import KubernetesBackendConfig
Expand Down Expand Up @@ -58,5 +59,6 @@
"TrainerClient",
"TrainerType",
"LocalProcessBackendConfig",
"ContainerBackendConfig",
"KubernetesBackendConfig",
]
17 changes: 13 additions & 4 deletions kubeflow/trainer/api/trainer_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
import logging
from typing import Optional, Union

from kubeflow.trainer.backends.container.backend import ContainerBackend
from kubeflow.trainer.backends.container.types import ContainerBackendConfig
from kubeflow.trainer.backends.kubernetes.backend import KubernetesBackend
from kubeflow.trainer.backends.kubernetes.types import KubernetesBackendConfig
from kubeflow.trainer.backends.localprocess.backend import (
Expand All @@ -31,14 +33,19 @@
class TrainerClient:
def __init__(
self,
backend_config: Union[KubernetesBackendConfig, LocalProcessBackendConfig] = None,
backend_config: Union[
KubernetesBackendConfig,
LocalProcessBackendConfig,
ContainerBackendConfig,
] = None,
):
"""Initialize a Kubeflow Trainer client.

Args:
backend_config: Backend configuration. Either KubernetesBackendConfig or
LocalProcessBackendConfig, or None to use the backend's
default config class. Defaults to KubernetesBackendConfig.
backend_config: Backend configuration. Either KubernetesBackendConfig,
LocalProcessBackendConfig, ContainerBackendConfig,
or None to use the backend's default config class.
Defaults to KubernetesBackendConfig.

Raises:
ValueError: Invalid backend configuration.
Expand All @@ -52,6 +59,8 @@ def __init__(
self.backend = KubernetesBackend(backend_config)
elif isinstance(backend_config, LocalProcessBackendConfig):
self.backend = LocalProcessBackend(backend_config)
elif isinstance(backend_config, ContainerBackendConfig):
self.backend = ContainerBackend(backend_config)
else:
raise ValueError(f"Invalid backend config '{backend_config}'")

Expand Down
18 changes: 18 additions & 0 deletions kubeflow/trainer/backends/container/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Copyright 2025 The Kubeflow Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from kubeflow.trainer.backends.container.backend import ContainerBackend
from kubeflow.trainer.backends.container.types import ContainerBackendConfig

__all__ = ["ContainerBackend", "ContainerBackendConfig"]
31 changes: 31 additions & 0 deletions kubeflow/trainer/backends/container/adapters/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Copyright 2025 The Kubeflow Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Container runtime adapters.

This package provides adapter implementations for different container runtimes
(Docker and Podman), allowing the Container backend to work with either runtime
through a unified interface.
"""

from kubeflow.trainer.backends.container.adapters.base import BaseContainerClientAdapter
from kubeflow.trainer.backends.container.adapters.docker import DockerClientAdapter
from kubeflow.trainer.backends.container.adapters.podman import PodmanClientAdapter

__all__ = [
"BaseContainerClientAdapter",
"DockerClientAdapter",
"PodmanClientAdapter",
]
196 changes: 196 additions & 0 deletions kubeflow/trainer/backends/container/adapters/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
# Copyright 2025 The Kubeflow Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Container client adapters for Docker and Podman.

This module implements the adapter pattern to abstract away differences between
Docker and Podman APIs, allowing the backend to work with either runtime through
a common interface.
"""

from __future__ import annotations

import abc
from collections.abc import Iterator


class BaseContainerClientAdapter(abc.ABC):
    """
    Abstract adapter interface for container clients.

    This adapter abstracts the container runtime API, allowing the backend
    to work with Docker and Podman through a unified interface. Every method
    is abstract: a concrete adapter must override all of them before it can
    be instantiated.
    """

    @abc.abstractmethod
    def ping(self):
        """Test the connection to the container runtime."""
        raise NotImplementedError()

    @abc.abstractmethod
    def create_network(
        self,
        name: str,
        labels: dict[str, str],
    ) -> str:
        """
        Create a container network.

        Args:
            name: Network name
            labels: Labels to attach to the network

        Returns:
            Network ID or name
        """
        raise NotImplementedError()

    @abc.abstractmethod
    def delete_network(self, network_id: str):
        """
        Delete a network.

        Args:
            network_id: Identifier of the network to delete
        """
        raise NotImplementedError()

    @abc.abstractmethod
    def create_and_start_container(
        self,
        image: str,
        command: list[str],
        name: str,
        network_id: str,
        environment: dict[str, str],
        labels: dict[str, str],
        volumes: dict[str, dict[str, str]],
        working_dir: str,
        gpus: int | bool | None = None,
    ) -> str:
        """
        Create and start a container.

        Args:
            image: Container image
            command: Command to run
            name: Container name
            network_id: Network to attach to
            environment: Environment variables
            labels: Container labels
            volumes: Volume mounts (nested mapping; the exact schema is
                defined by the concrete adapter)
            working_dir: Working directory
            gpus: GPU configuration (None=no GPU, True=all GPUs, int=number of GPUs)

        Returns:
            Container ID
        """
        raise NotImplementedError()

    @abc.abstractmethod
    def get_container(self, container_id: str):
        """
        Get container object by ID.

        Args:
            container_id: Container ID

        Returns:
            The container object; its concrete type is not fixed by this
            interface.
        """
        raise NotImplementedError()

    @abc.abstractmethod
    def container_logs(self, container_id: str, follow: bool) -> Iterator[str]:
        """
        Stream logs from a container.

        Args:
            container_id: Container ID
            follow: Whether to follow the log stream (presumably keeps
                yielding new output while the container runs - confirm
                against the concrete adapters)

        Returns:
            Iterator over log output strings
        """
        raise NotImplementedError()

    @abc.abstractmethod
    def stop_container(self, container_id: str, timeout: int = 10):
        """
        Stop a container.

        Args:
            container_id: Container ID
            timeout: Stop timeout, defaults to 10 (presumably seconds -
                confirm against the concrete adapters)
        """
        raise NotImplementedError()

    @abc.abstractmethod
    def remove_container(self, container_id: str, force: bool = True):
        """
        Remove a container.

        Args:
            container_id: Container ID
            force: Whether to force removal, defaults to True
        """
        raise NotImplementedError()

    @abc.abstractmethod
    def pull_image(self, image: str):
        """
        Pull an image.

        Args:
            image: Container image reference to pull
        """
        raise NotImplementedError()

    @abc.abstractmethod
    def image_exists(self, image: str) -> bool:
        """
        Check if an image exists locally.

        Args:
            image: Container image reference to look up

        Returns:
            True if the image exists locally, False otherwise
        """
        raise NotImplementedError()

    @abc.abstractmethod
    def run_oneoff_container(self, image: str, command: list[str]) -> str:
        """
        Run a short-lived container and return its output.

        Args:
            image: Container image
            command: Command to run

        Returns:
            Container output as string
        """
        raise NotImplementedError()

    @abc.abstractmethod
    def container_status(self, container_id: str) -> tuple[str, int | None]:
        """
        Get container status.

        Args:
            container_id: Container ID

        Returns:
            Tuple of (status_string, exit_code)
            Status strings: "running", "created", "exited", etc.
            Exit code is None if container hasn't exited
        """
        raise NotImplementedError()

    @abc.abstractmethod
    def get_container_ip(self, container_id: str, network_id: str) -> str | None:
        """
        Get container's IP address on a specific network.

        Args:
            container_id: Container ID
            network_id: Network name or ID

        Returns:
            IP address string or None if not found
        """
        raise NotImplementedError()

    @abc.abstractmethod
    def list_containers(self, filters: dict[str, list[str]] | None = None) -> list[dict]:
        """
        List containers, optionally filtered by labels.

        Args:
            filters: Dictionary mapping a filter name to a list of filter
                values (e.g., {"label": ["key=value"]}), or None for no
                filtering

        Returns:
            List of container info dictionaries with keys:
            - id: Container ID
            - name: Container name
            - labels: Dictionary of labels
            - status: Container status
            - created: Creation timestamp
        """
        raise NotImplementedError()

    @abc.abstractmethod
    def get_network(self, network_id: str) -> dict | None:
        """
        Get network information by ID or name.

        Args:
            network_id: Network ID or name

        Returns:
            Dictionary with network info including labels, or None if not found
        """
        raise NotImplementedError()
Loading