kubeflow · Fiona-Waters · Sep 29, 2025 · Oct 3, 2025 · Oct 9, 2025 · Oct 14, 2025
diff --git a/README.md b/README.md
@@ -71,6 +71,45 @@ TrainerClient().wait_for_job_status(job_id)
 print("\n".join(TrainerClient().get_job_logs(name=job_id)))
 ```
 
+## Local Development
+
+Kubeflow SDK provides first-class support for local development, allowing you to test and iterate on your models without needing a Kubernetes cluster.
+
+### Execution Backends
+
+Choose the backend that fits your development workflow:
+
+| Backend | Description | Use Case |
+|---------|-------------|----------|
+| **KubernetesBackend** | Run jobs on a Kubernetes cluster | Production, multi-node distributed training |
+| **ContainerBackend** | Auto-detects Docker or Podman | Local development with container isolation |
+| **LocalProcessBackend** | Run as local Python subprocesses | Quick prototyping, debugging |
+
+### Local Container Execution
+
+The **ContainerBackend** automatically detects and uses either Docker or Podman:
+
+```bash
+# Install with Docker support
+pip install "kubeflow[docker]"
+
+# Or install with Podman support
+pip install "kubeflow[podman]"
+```
+
+```python
+from kubeflow.trainer import TrainerClient, ContainerBackendConfig, CustomTrainer
+
+# Auto-detects Docker or Podman
+config = ContainerBackendConfig()
+client = TrainerClient(backend_config=config)
+
+# Your training runs in isolated containers
+job_id = client.train(trainer=CustomTrainer(func=train_fn))
+```
+
+For detailed configuration options and platform-specific setup (macOS, Linux), see the [ContainerBackend documentation](kubeflow/trainer/backends/container/README.md).
+
 ## Supported Kubeflow Projects
 
 | Project                     | Status           | Version Support | Description                                                          |

diff --git a/kubeflow/trainer/__init__.py b/kubeflow/trainer/__init__.py
@@ -15,6 +15,7 @@
 
 # Import the Kubeflow Trainer client.
 from kubeflow.trainer.api.trainer_client import TrainerClient  # noqa: F401
+from kubeflow.trainer.backends.container.types import ContainerBackendConfig
 
 # import backends and its associated configs
 from kubeflow.trainer.backends.kubernetes.types import KubernetesBackendConfig
@@ -58,5 +59,6 @@
     "TrainerClient",
     "TrainerType",
     "LocalProcessBackendConfig",
+    "ContainerBackendConfig",
     "KubernetesBackendConfig",
 ]
diff --git a/kubeflow/trainer/api/trainer_client.py b/kubeflow/trainer/api/trainer_client.py
@@ -16,6 +16,8 @@
 import logging
 from typing import Optional, Union
 
+from kubeflow.trainer.backends.container.backend import ContainerBackend
+from kubeflow.trainer.backends.container.types import ContainerBackendConfig
 from kubeflow.trainer.backends.kubernetes.backend import KubernetesBackend
 from kubeflow.trainer.backends.kubernetes.types import KubernetesBackendConfig
 from kubeflow.trainer.backends.localprocess.backend import (
@@ -31,14 +33,19 @@
 class TrainerClient:
     def __init__(
         self,
-        backend_config: Union[KubernetesBackendConfig, LocalProcessBackendConfig] = None,
+        backend_config: Union[
+            KubernetesBackendConfig,
+            LocalProcessBackendConfig,
+            ContainerBackendConfig,
+        ] = None,
     ):
         """Initialize a Kubeflow Trainer client.
 
         Args:
-            backend_config: Backend configuration. Either KubernetesBackendConfig or
-                            LocalProcessBackendConfig, or None to use the backend's
-                            default config class. Defaults to KubernetesBackendConfig.
+            backend_config: Backend configuration. Either KubernetesBackendConfig,
+                            LocalProcessBackendConfig, ContainerBackendConfig,
+                            or None to use the backend's default config class.
+                            Defaults to KubernetesBackendConfig.
 
         Raises:
             ValueError: Invalid backend configuration.
@@ -52,6 +59,8 @@ def __init__(
             self.backend = KubernetesBackend(backend_config)
         elif isinstance(backend_config, LocalProcessBackendConfig):
             self.backend = LocalProcessBackend(backend_config)
+        elif isinstance(backend_config, ContainerBackendConfig):
+            self.backend = ContainerBackend(backend_config)
         else:
             raise ValueError(f"Invalid backend config '{backend_config}'")
 

diff --git a/kubeflow/trainer/backends/container/__init__.py b/kubeflow/trainer/backends/container/__init__.py
@@ -0,0 +1,18 @@
+# Copyright 2025 The Kubeflow Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from kubeflow.trainer.backends.container.backend import ContainerBackend
+from kubeflow.trainer.backends.container.types import ContainerBackendConfig
+
+__all__ = ["ContainerBackend", "ContainerBackendConfig"]
diff --git a/kubeflow/trainer/backends/container/adapters/__init__.py b/kubeflow/trainer/backends/container/adapters/__init__.py
@@ -0,0 +1,31 @@
+# Copyright 2025 The Kubeflow Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Container runtime adapters.
+
+This package provides adapter implementations for different container runtimes
+(Docker and Podman), allowing the Container backend to work with either runtime
+through a unified interface.
+"""
+
+from kubeflow.trainer.backends.container.adapters.base import BaseContainerClientAdapter
+from kubeflow.trainer.backends.container.adapters.docker import DockerClientAdapter
+from kubeflow.trainer.backends.container.adapters.podman import PodmanClientAdapter
+
+__all__ = [
+    "BaseContainerClientAdapter",
+    "DockerClientAdapter",
+    "PodmanClientAdapter",
+]
diff --git a/kubeflow/trainer/backends/container/adapters/base.py b/kubeflow/trainer/backends/container/adapters/base.py
@@ -0,0 +1,196 @@
+# Copyright 2025 The Kubeflow Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Abstract base for container client adapters (Docker and Podman).
+
+This module defines the interface of the adapter pattern that abstracts away
+differences between the Docker and Podman APIs, allowing the backend to work
+with either runtime through a common interface.
+"""
+
+from __future__ import annotations
+
+import abc
+from collections.abc import Iterator
+
+
+class BaseContainerClientAdapter(abc.ABC):
+    """
+    Abstract adapter interface for container clients.
+
+    Every method is abstract: concrete adapters implement them so that the backend
+    can work with Docker and Podman through a unified interface.
+    """
+
+    @abc.abstractmethod
+    def ping(self):
+        """Test the connection to the container runtime."""
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def create_network(
+        self,
+        name: str,
+        labels: dict[str, str],
+    ) -> str:
+        """
+        Create a container network.
+
+        Args:
+            name: Network name
+            labels: Labels to attach to the network
+
+        Returns:
+            Network ID or name
+        """
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def delete_network(self, network_id: str):
+        """Delete the network identified by ``network_id``."""
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def create_and_start_container(
+        self,
+        image: str,
+        command: list[str],
+        name: str,
+        network_id: str,
+        environment: dict[str, str],
+        labels: dict[str, str],
+        volumes: dict[str, dict[str, str]],
+        working_dir: str,
+        gpus: int | bool | None = None,
+    ) -> str:
+        """
+        Create and start a container.
+
+        Args:
+            image: Container image
+            command: Command to run
+            name: Container name
+            network_id: Network to attach to
+            environment: Environment variables
+            labels: Container labels
+            volumes: Volume mounts (presumably {host_path: {"bind": ..., "mode": ...}}; confirm)
+            working_dir: Working directory
+            gpus: GPU configuration (None=no GPU, True=all GPUs, int=number of GPUs)
+
+        Returns:
+            Container ID
+        """
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def get_container(self, container_id: str):
+        """Get container object by ID."""
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def container_logs(self, container_id: str, follow: bool) -> Iterator[str]:
+        """Stream logs from a container as an iterator of strings."""
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def stop_container(self, container_id: str, timeout: int = 10):
+        """Stop a container; ``timeout`` defaults to 10 (presumably seconds; TODO confirm)."""
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def remove_container(self, container_id: str, force: bool = True):
+        """Remove a container; ``force`` defaults to True."""
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def pull_image(self, image: str):
+        """Pull an image."""
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def image_exists(self, image: str) -> bool:
+        """Check if an image exists locally."""
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def run_oneoff_container(self, image: str, command: list[str]) -> str:
+        """
+        Run a short-lived container and return its output.
+
+        Args:
+            image: Container image
+            command: Command to run
+
+        Returns:
+            Container output as string
+        """
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def container_status(self, container_id: str) -> tuple[str, int | None]:
+        """
+        Get container status.
+
+        Returns:
+            Tuple of (status_string, exit_code)
+            Status strings: "running", "created", "exited", etc.
+            Exit code is None if container hasn't exited
+        """
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def get_container_ip(self, container_id: str, network_id: str) -> str | None:
+        """
+        Get container's IP address on a specific network.
+
+        Args:
+            container_id: Container ID
+            network_id: Network name or ID
+
+        Returns:
+            IP address string or None if not found
+        """
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def list_containers(self, filters: dict[str, list[str]] | None = None) -> list[dict]:
+        """
+        List containers, optionally filtered by labels.
+
+        Args:
+            filters: Dictionary of filters (e.g., {"label": ["key=value"]})
+
+        Returns:
+            List of container info dictionaries with keys:
+            - id: Container ID
+            - name: Container name
+            - labels: Dictionary of labels
+            - status: Container status
+            - created: Creation timestamp
+        """
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def get_network(self, network_id: str) -> dict | None:
+        """
+        Get network information by ID or name.
+
+        Args:
+            network_id: Network ID or name
+
+        Returns:
+            Dictionary with network info including labels, or None if not found
+        """
+        raise NotImplementedError()