From d9c5be811d45d6efe355771f64713db0cb3bcbb6 Mon Sep 17 00:00:00 2001 From: Antonin Stefanutti Date: Mon, 29 Sep 2025 09:07:32 +0200 Subject: [PATCH 1/2] chore(test): Support e2e cluster setup with Podman Signed-off-by: Antonin Stefanutti --- hack/e2e-setup-cluster.sh | 24 ++++++++++++++++----- hack/e2e-setup-gpu-cluster.sh | 34 ++++++++++++++++++------------ hack/scripts/load-image-to-kind.sh | 33 +++++++++++++++++++++++++++++ 3 files changed, 73 insertions(+), 18 deletions(-) create mode 100644 hack/scripts/load-image-to-kind.sh diff --git a/hack/e2e-setup-cluster.sh b/hack/e2e-setup-cluster.sh index 406995c318..2377c5de03 100755 --- a/hack/e2e-setup-cluster.sh +++ b/hack/e2e-setup-cluster.sh @@ -21,6 +21,14 @@ set -o nounset set -o pipefail set -x +# Source container runtime utilities +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "${SCRIPT_DIR}/scripts/container-runtime.sh" +source "${SCRIPT_DIR}/scripts/load-image-to-kind.sh" + +# Setup container runtime +setup_container_runtime + # Configure variables. KIND=${KIND:-./bin/kind} K8S_VERSION=${K8S_VERSION:-1.32.0} @@ -34,11 +42,13 @@ CONTROLLER_MANAGER_CI_IMAGE_NAME="ghcr.io/kubeflow/trainer/trainer-controller-ma CONTROLLER_MANAGER_CI_IMAGE_TAG="test" CONTROLLER_MANAGER_CI_IMAGE="${CONTROLLER_MANAGER_CI_IMAGE_NAME}:${CONTROLLER_MANAGER_CI_IMAGE_TAG}" echo "Build Kubeflow Trainer images" -docker build . -f cmd/trainer-controller-manager/Dockerfile -t ${CONTROLLER_MANAGER_CI_IMAGE} +${CONTAINER_RUNTIME} build . -f cmd/trainer-controller-manager/Dockerfile -t ${CONTROLLER_MANAGER_CI_IMAGE} echo "Create Kind cluster and load Kubeflow Trainer images" ${KIND} create cluster --image "${KIND_NODE_VERSION}" -${KIND} load docker-image ${CONTROLLER_MANAGER_CI_IMAGE} + +# Load Trainer controller manager image in KinD +load_image_to_kind ${CONTROLLER_MANAGER_CI_IMAGE} echo "Deploy Kubeflow Trainer control plane" E2E_MANIFESTS_DIR="artifacts/e2e/manifests" @@ -86,8 +96,12 @@ kubectl apply --server-side -k manifests/overlays/runtimes || ( TORCH_RUNTIME_IMAGE=pytorch/pytorch:2.7.1-cuda12.8-cudnn9-runtime DEEPSPEED_RUNTIME_IMAGE=ghcr.io/kubeflow/trainer/deepspeed-runtime:latest -docker pull ${TORCH_RUNTIME_IMAGE} -docker pull ${DEEPSPEED_RUNTIME_IMAGE} -${KIND} load docker-image ${TORCH_RUNTIME_IMAGE} ${DEEPSPEED_RUNTIME_IMAGE} +# Load Torch runtime image in KinD +${CONTAINER_RUNTIME} pull ${TORCH_RUNTIME_IMAGE} +load_image_to_kind ${TORCH_RUNTIME_IMAGE} + +# Load DeepSpeed runtime image in KinD +${CONTAINER_RUNTIME} pull ${DEEPSPEED_RUNTIME_IMAGE} +load_image_to_kind ${DEEPSPEED_RUNTIME_IMAGE} print_cluster_info diff --git a/hack/e2e-setup-gpu-cluster.sh b/hack/e2e-setup-gpu-cluster.sh index 0a2b74a142..57b2fb2b70 100755 --- a/hack/e2e-setup-gpu-cluster.sh +++ b/hack/e2e-setup-gpu-cluster.sh @@ -21,8 +21,16 @@ set -o nounset set -o pipefail set -x +# Source container runtime utilities +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "${SCRIPT_DIR}/scripts/container-runtime.sh" +source "${SCRIPT_DIR}/scripts/load-image-to-kind.sh" + +# Setup container runtime +setup_container_runtime + # Configure variables. -KIND=${KIND:-./bin/kind} +KIND=${KIND:-nvkind} K8S_VERSION=${K8S_VERSION:-1.32.0} GPU_OPERATOR_VERSION="v25.3.2" KIND_NODE_VERSION=kindest/node:v${K8S_VERSION} @@ -35,20 +43,20 @@ CONTROLLER_MANAGER_CI_IMAGE_NAME="ghcr.io/kubeflow/trainer/trainer-controller-ma CI_IMAGE_TAG="test" CONTROLLER_MANAGER_CI_IMAGE="${CONTROLLER_MANAGER_CI_IMAGE_NAME}:${CI_IMAGE_TAG}" echo "Build Kubeflow Trainer images" -docker build . -f cmd/trainer-controller-manager/Dockerfile -t ${CONTROLLER_MANAGER_CI_IMAGE} +${CONTAINER_RUNTIME} build . -f cmd/trainer-controller-manager/Dockerfile -t ${CONTROLLER_MANAGER_CI_IMAGE} # Kubeflow Trainer initializer images. DATASET_INITIALIZER_CI_IMAGE_NAME="ghcr.io/kubeflow/trainer/dataset-initializer" DATASET_INITIALIZER_CI_IMAGE="${DATASET_INITIALIZER_CI_IMAGE_NAME}:${CI_IMAGE_TAG}" -docker build . -f cmd/initializers/dataset/Dockerfile -t ${DATASET_INITIALIZER_CI_IMAGE} +${CONTAINER_RUNTIME} build . -f cmd/initializers/dataset/Dockerfile -t ${DATASET_INITIALIZER_CI_IMAGE} MODEL_INITIALIZER_CI_IMAGE_NAME="ghcr.io/kubeflow/trainer/model-initializer" MODEL_INITIALIZER_CI_IMAGE="${MODEL_INITIALIZER_CI_IMAGE_NAME}:${CI_IMAGE_TAG}" -docker build . -f cmd/initializers/model/Dockerfile -t ${MODEL_INITIALIZER_CI_IMAGE} +${CONTAINER_RUNTIME} build . -f cmd/initializers/model/Dockerfile -t ${MODEL_INITIALIZER_CI_IMAGE} TRAINER_CI_IMAGE_NAME="ghcr.io/kubeflow/trainer/torchtune-trainer" TRAINER_CI_IMAGE="${TRAINER_CI_IMAGE_NAME}:${CI_IMAGE_TAG}" -docker build . -f cmd/trainers/torchtune/Dockerfile -t ${TRAINER_CI_IMAGE} +${CONTAINER_RUNTIME} build . -f cmd/trainers/torchtune/Dockerfile -t ${TRAINER_CI_IMAGE} # Set up Docker to use NVIDIA runtime. sudo nvidia-ctk runtime configure --runtime=docker --set-as-default --cdi.enabled @@ -56,8 +64,8 @@ sudo nvidia-ctk config --set accept-nvidia-visible-devices-as-volume-mounts=true sudo systemctl restart docker # Create a Kind cluster with GPU support. -nvkind cluster create --name ${GPU_CLUSTER_NAME} --image "${KIND_NODE_VERSION}" -nvkind cluster print-gpus +${KIND} cluster create --name ${GPU_CLUSTER_NAME} --image "${KIND_NODE_VERSION}" +${KIND} cluster print-gpus # Install gpu-operator to make sure we can run GPU workloads. echo "Install NVIDIA GPU Operator" @@ -83,12 +91,12 @@ kubectl get nodes -o=custom-columns=NAME:.metadata.name,GPU:.status.allocatable. # Load Kubeflow Trainer images echo "Load Kubeflow Trainer images" -kind load docker-image "${CONTROLLER_MANAGER_CI_IMAGE}" --name "${GPU_CLUSTER_NAME}" +load_image_to_kind "${CONTROLLER_MANAGER_CI_IMAGE}" "${GPU_CLUSTER_NAME}" echo "Load Kubeflow Trainer initializers images" -kind load docker-image "${DATASET_INITIALIZER_CI_IMAGE}" --name "${GPU_CLUSTER_NAME}" -kind load docker-image "${MODEL_INITIALIZER_CI_IMAGE}" --name "${GPU_CLUSTER_NAME}" -kind load docker-image "${TRAINER_CI_IMAGE}" --name "${GPU_CLUSTER_NAME}" +load_image_to_kind "${DATASET_INITIALIZER_CI_IMAGE}" "${GPU_CLUSTER_NAME}" +load_image_to_kind "${MODEL_INITIALIZER_CI_IMAGE}" "${GPU_CLUSTER_NAME}" +load_image_to_kind "${TRAINER_CI_IMAGE}" "${GPU_CLUSTER_NAME}" # Deploy Kubeflow Trainer control plane echo "Deploy Kubeflow Trainer control plane" @@ -153,7 +161,7 @@ kubectl apply --server-side -k "${E2E_RUNTIMES_DIR}" || ( # TODO (andreyvelich): Discuss how we want to pre-load runtime images to the Kind cluster. TORCH_RUNTIME_IMAGE=pytorch/pytorch:2.7.1-cuda12.8-cudnn9-runtime -docker pull ${TORCH_RUNTIME_IMAGE} -kind load docker-image ${TORCH_RUNTIME_IMAGE} --name ${GPU_CLUSTER_NAME} +${CONTAINER_RUNTIME} pull ${TORCH_RUNTIME_IMAGE} +load_image_to_kind ${TORCH_RUNTIME_IMAGE} ${GPU_CLUSTER_NAME} print_cluster_info diff --git a/hack/scripts/load-image-to-kind.sh b/hack/scripts/load-image-to-kind.sh new file mode 100644 index 0000000000..80c6382975 --- /dev/null +++ b/hack/scripts/load-image-to-kind.sh @@ -0,0 +1,33 @@ +#!/usr/bin/env bash + +# Copyright 2025 The Kubeflow Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Function to load container image into KinD cluster +load_image_to_kind() { + local image_name="$1" + local cluster_name="${2:-}" + local cluster_arg="" + + if [[ -n "${cluster_name}" ]]; then + cluster_arg="--name ${cluster_name}" + fi + + echo "Loading image ${image_name} into KinD cluster${cluster_name:+ ${cluster_name}}" + if [[ "${CONTAINER_RUNTIME}" == "docker" ]]; then + ${KIND} load docker-image "${image_name}" ${cluster_arg} + else + ${CONTAINER_RUNTIME} save "${image_name}" -o /dev/stdout | ${KIND} load image-archive /dev/stdin ${cluster_arg} + fi +} From 559965b50d54ca59ea6bd1e5aa56cea0d4bd40b3 Mon Sep 17 00:00:00 2001 From: Antonin Stefanutti Date: Thu, 2 Oct 2025 09:36:14 +0200 Subject: [PATCH 2/2] Fix KinD cluster creation for GPU E2E tests Signed-off-by: Antonin Stefanutti --- hack/e2e-setup-gpu-cluster.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hack/e2e-setup-gpu-cluster.sh b/hack/e2e-setup-gpu-cluster.sh index 57b2fb2b70..55d558c811 100755 --- a/hack/e2e-setup-gpu-cluster.sh +++ b/hack/e2e-setup-gpu-cluster.sh @@ -30,7 +30,7 @@ source "${SCRIPT_DIR}/scripts/load-image-to-kind.sh" setup_container_runtime # Configure variables. -KIND=${KIND:-nvkind} +KIND=${KIND:-kind} K8S_VERSION=${K8S_VERSION:-1.32.0} GPU_OPERATOR_VERSION="v25.3.2" KIND_NODE_VERSION=kindest/node:v${K8S_VERSION} @@ -64,8 +64,8 @@ sudo nvidia-ctk config --set accept-nvidia-visible-devices-as-volume-mounts=true sudo systemctl restart docker # Create a Kind cluster with GPU support. -${KIND} cluster create --name ${GPU_CLUSTER_NAME} --image "${KIND_NODE_VERSION}" -${KIND} cluster print-gpus +nvkind cluster create --name ${GPU_CLUSTER_NAME} --image "${KIND_NODE_VERSION}" +nvkind cluster print-gpus # Install gpu-operator to make sure we can run GPU workloads. echo "Install NVIDIA GPU Operator"