From 93a21d96658fec3adad8b5f529103cf8050f1d89 Mon Sep 17 00:00:00 2001 From: Brian Gallagher Date: Tue, 10 Jun 2025 20:52:44 +0100 Subject: [PATCH 1/6] add e2e notebook tests Signed-off-by: Brian Gallagher --- .github/workflows/test-e2e.yaml | 91 ++++++++++++++++++++++++++++ Makefile | 33 +++++++++++ hack/e2e-run-notebook.sh | 49 ++++++++++++++++ hack/e2e-setup-cluster.sh | 101 ++++++++++++++++++++++++++++++++ 4 files changed, 274 insertions(+) create mode 100644 .github/workflows/test-e2e.yaml create mode 100644 Makefile create mode 100755 hack/e2e-run-notebook.sh create mode 100755 hack/e2e-setup-cluster.sh diff --git a/.github/workflows/test-e2e.yaml b/.github/workflows/test-e2e.yaml new file mode 100644 index 000000000..5bbf9edea --- /dev/null +++ b/.github/workflows/test-e2e.yaml @@ -0,0 +1,91 @@ +name: E2E Test + +on: + pull_request + +jobs: + e2e-test: + name: E2E Test + runs-on: + labels: ubuntu-latest-16-cores + + strategy: + fail-fast: false + matrix: + kubernetes-version: ["1.29.14", "1.30.0", "1.31.0", "1.32.3"] + + steps: + - name: Checkout Kubeflow SDK repository + uses: actions/checkout@v4 + + # Checkout the Kubeflow Trainer repository in order to get the Go and KIND versions. + - name: Checkout Kubeflow Trainer repo for Go version + uses: actions/checkout@v4 + with: + repository: kubeflow/trainer + ref: master + path: trainer + + # Step 3: Get Go and KIND versions from the trainer repo's go.mod + # These versions will be set as environment variables for subsequent steps. + - name: Get Go and KIND versions from trainer repo + run: | + echo "Extracting Go version from trainer/go.mod..." + GO_VERSION=$(grep '^go ' ./trainer/go.mod | awk '{print $2}' | tr -d '\n') + echo "Detected Go version from trainer/go.mod: $GO_VERSION" + echo "GO_VERSION=$GO_VERSION" >> "$GITHUB_ENV" + + echo "Extracting KIND version from trainer/go.mod..."
+ # This specifically looks for 'sigs.k8s.io/kind' and extracts its version + KIND_VERSION=$(grep 'sigs.k8s.io/kind' ./trainer/go.mod | awk '{print $2}' | tr -d '\n') + echo "Detected KIND version from trainer/go.mod: $KIND_VERSION" + echo "KIND_VERSION=$KIND_VERSION" >> "$GITHUB_ENV" + + - name: Setup Go + uses: actions/setup-go@v5 + with: + go-version: ${{ env.GO_VERSION }} # Use the GO_VERSION environment variable + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: 3.11 + + - name: Install Python dependencies + run: | + echo "Installing Papermill and Jupyter" + pip install papermill==2.6.0 jupyter==1.1.1 ipykernel==6.29.5 + + echo "Installing Kubeflow SDK locally from ./python" + # This path (./python) is relative to the *main* repository checkout (kubeflow/sdk) + pip install ./python + working-directory: . # Ensure pip runs from the SDK repo root + + - name: Setup cluster + run: | + make test-e2e-setup-cluster \ + K8S_VERSION=${{ matrix.kubernetes-version }} \ + KIND_VERSION=${{ env.KIND_VERSION }} + working-directory: . # Execute make from the root of the SDK repo + + - name: Run e2e test for example Notebooks + run: | + mkdir -p artifacts/notebooks # Create the output directory + # Execute make commands, passing notebook paths and output locations + make test-e2e-notebook \ + NOTEBOOK_INPUT=./examples/training/pytorch/image-classification/mnist.ipynb \ + NOTEBOOK_OUTPUT=./artifacts/notebooks/${{ matrix.kubernetes-version }}_mnist.ipynb \ + PAPERMILL_TIMEOUT=900 + make test-e2e-notebook \ + NOTEBOOK_INPUT=./examples/training/pytorch/question-answering/fine-tune-distilbert.ipynb \ + NOTEBOOK_OUTPUT=./artifacts/notebooks/${{ matrix.kubernetes-version }}_fine-tune-distilbert.ipynb \ + PAPERMILL_TIMEOUT=900 + working-directory: . 
# Execute make from the root of the SDK repo + + - name: Upload Artifacts to GitHub + uses: actions/upload-artifact@v4 + if: always() # Ensure artifacts are uploaded even if previous steps fail + with: + name: ${{ matrix.kubernetes-version }} + path: ./artifacts/notebooks/* # Path relative to the workspace root + retention-days: 1 # diff --git a/Makefile b/Makefile new file mode 100644 index 000000000..4fd0624ce --- /dev/null +++ b/Makefile @@ -0,0 +1,33 @@ +# Get the currently used golang install path (in GOPATH/bin, unless GOBIN is set) +ifeq (,$(shell go env GOBIN)) +GOBIN=$(shell go env GOPATH)/bin +else +GOBIN=$(shell go env GOBIN) +endif + +PROJECT_DIR := $(shell dirname $(abspath $(lastword $(MAKEFILE_LIST)))) +LOCALBIN ?= $(PROJECT_DIR)/bin + +# Tool versions +KIND_VERSION ?= $(shell go list -m -f '{{.Version}}' sigs.k8s.io/kind) +K8S_VERSION ?= 1.32.0 + +# Tool binaries +KIND ?= $(LOCALBIN)/kind + +# Input and output location for Notebooks executed with Papermill. +NOTEBOOK_INPUT=$(PROJECT_DIR)/examples/training/pytorch/image-classification/mnist.ipynb +NOTEBOOK_OUTPUT=$(PROJECT_DIR)/artifacts/notebooks/trainer_output.ipynb +PAPERMILL_TIMEOUT=900 + +.PHONY: kind +kind: ## Download Kind binary if required. + GOBIN=$(LOCALBIN) go install sigs.k8s.io/kind@$(KIND_VERSION) + +.PHONY: test-e2e-notebook +test-e2e-notebook: ## Run Jupyter Notebook with Papermill. + NOTEBOOK_INPUT=$(NOTEBOOK_INPUT) NOTEBOOK_OUTPUT=$(NOTEBOOK_OUTPUT) PAPERMILL_TIMEOUT=$(PAPERMILL_TIMEOUT) ./hack/e2e-run-notebook.sh + +.PHONY: test-e2e-setup-cluster +test-e2e-setup-cluster: kind ## Setup Kind cluster for e2e test. + KIND=$(KIND) K8S_VERSION=$(K8S_VERSION) ./hack/e2e-setup-cluster.sh diff --git a/hack/e2e-run-notebook.sh b/hack/e2e-run-notebook.sh new file mode 100755 index 000000000..17ba6a5bd --- /dev/null +++ b/hack/e2e-run-notebook.sh @@ -0,0 +1,49 @@ +#!/usr/bin/env bash + +# Copyright 2025 The Kubeflow Authors. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This shell is used to run Jupyter Notebook with Papermill. + +set -o errexit +set -o nounset +set -o pipefail +set -x + +if [ -z "${NOTEBOOK_INPUT}" ]; then + echo "NOTEBOOK_INPUT env variable must be set to run this script." + exit 1 +fi + +if [ -z "${NOTEBOOK_OUTPUT}" ]; then + echo "NOTEBOOK_OUTPUT env variable must be set to run this script." + exit 1 +fi + +if [ -z "${PAPERMILL_TIMEOUT}" ]; then + echo "PAPERMILL_TIMEOUT env variable must be set to run this script." + exit 1 +fi + +print_results() { + kubectl get pods + kubectl describe pod + kubectl describe trainjob + kubectl logs -n kubeflow-system -l app.kubernetes.io/name=trainer + kubectl logs -l jobset.sigs.k8s.io/replicatedjob-name=trainer-node,batch.kubernetes.io/job-completion-index=0 --tail -1 + kubectl wait trainjob --for=condition=Complete --all --timeout 3s +} + +(papermill "${NOTEBOOK_INPUT}" "${NOTEBOOK_OUTPUT}" --execution-timeout "${PAPERMILL_TIMEOUT}" && print_results) || + (print_results && exit 1) diff --git a/hack/e2e-setup-cluster.sh b/hack/e2e-setup-cluster.sh new file mode 100755 index 000000000..787f8ef3e --- /dev/null +++ b/hack/e2e-setup-cluster.sh @@ -0,0 +1,101 @@ +#!/usr/bin/env bash + +# Copyright 2025 The Kubeflow Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This shell is used to setup Kind cluster for Kubeflow Trainer e2e tests. + +set -o errexit +set -o nounset +set -o pipefail +set -x + +# Configure variables. +# KUBEFLOW_TRAINER_REPO="https://github.com/kubeflow/trainer" # This is no longer needed +KIND=${KIND:-./bin/kind} +K8S_VERSION=${K8S_VERSION:-1.32.0} +KIND_NODE_VERSION=kindest/node:v${K8S_VERSION} +NAMESPACE="kubeflow-system" +TIMEOUT="5m" + +# Define the path to the already cloned Kubeflow Trainer repository +# This path is relative to where the e2e-setup-cluster.sh script is executed (which is the SDK repo root) +TRAINER_REPO_PATH="./trainer" + +# Kubeflow Trainer images. +# TODO (andreyvelich): Support initializers images. +CONTROLLER_MANAGER_CI_IMAGE_NAME="ghcr.io/kubeflow/trainer/trainer-controller-manager" +CONTROLLER_MANAGER_CI_IMAGE_TAG="test" +CONTROLLER_MANAGER_CI_IMAGE="${CONTROLLER_MANAGER_CI_IMAGE_NAME}:${CONTROLLER_MANAGER_CI_IMAGE_TAG}" + +echo "Build Kubeflow Trainer images" +# Change 'docker build .' 
to 'docker build "${TRAINER_REPO_PATH}"' +# And specify the Dockerfile path relative to the TRAINER_REPO_PATH +docker build "${TRAINER_REPO_PATH}" -f "${TRAINER_REPO_PATH}"/cmd/trainer-controller-manager/Dockerfile -t "${CONTROLLER_MANAGER_CI_IMAGE}" + +echo "Create Kind cluster and load Kubeflow Trainer images" +"${KIND}" create cluster --image "${KIND_NODE_VERSION}" +"${KIND}" load docker-image "${CONTROLLER_MANAGER_CI_IMAGE}" + +echo "Deploy Kubeflow Trainer control plane and Jobset controller" +E2E_MANIFESTS_DIR="artifacts/e2e/manifests" +mkdir -p "${E2E_MANIFESTS_DIR}" +cat <<EOF >"${E2E_MANIFESTS_DIR}/kustomization.yaml" + apiVersion: kustomize.config.k8s.io/v1beta1 + kind: Kustomization + resources: + - "../../../trainer/manifests/overlays/manager" + images: + - name: "${CONTROLLER_MANAGER_CI_IMAGE_NAME}" + newTag: "${CONTROLLER_MANAGER_CI_IMAGE_TAG}" +EOF + +# Ensure kubectl applies from the correct context. +# The `E2E_MANIFESTS_DIR` is relative to where the script is run (SDK repo root). +kubectl apply --server-side -k "${E2E_MANIFESTS_DIR}" + +# We should wait until Deployment is in Ready status. +echo "Wait for Kubeflow Trainer to be ready" +(kubectl wait deploy/kubeflow-trainer-controller-manager --for=condition=available -n "${NAMESPACE}" --timeout "${TIMEOUT}" && + kubectl wait pods --for=condition=ready -n "${NAMESPACE}" --timeout "${TIMEOUT}" --all) || + ( + echo "Failed to wait until Kubeflow Trainer is ready" && + kubectl get pods -n "${NAMESPACE}" && + kubectl describe pods -n "${NAMESPACE}" && + exit 1 + ) + +print_cluster_info() { + kubectl version + kubectl cluster-info + kubectl get nodes + kubectl get pods -n "${NAMESPACE}" + kubectl describe pod -n "${NAMESPACE}" +} + +# TODO (andreyvelich): Currently, we print manager logs due to flaky test.
+echo "Deploy Kubeflow Trainer runtimes" +# Adjust path to manifests/overlays/runtimes to be relative to the cloned Trainer repo +kubectl apply --server-side -k "${TRAINER_REPO_PATH}/manifests/overlays/runtimes" || ( + kubectl logs -n "${NAMESPACE}" -l app.kubernetes.io/name=trainer && + print_cluster_info && + exit 1 +) + +# TODO (andreyvelich): Discuss how we want to pre-load runtime images to the Kind cluster. +TORCH_RUNTIME_IMAGE=pytorch/pytorch:2.5.0-cuda12.4-cudnn9-runtime +docker pull "${TORCH_RUNTIME_IMAGE}" +"${KIND}" load docker-image "${TORCH_RUNTIME_IMAGE}" + +print_cluster_info From 3a9ea54ecbcba9589e70ebc55c97c6f49e5edbea Mon Sep 17 00:00:00 2001 From: Brian Gallagher Date: Fri, 13 Jun 2025 14:12:07 +0100 Subject: [PATCH 2/6] address comments in review for e2e test Signed-off-by: Brian Gallagher --- .github/workflows/test-e2e.yaml | 4 ++-- Makefile | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test-e2e.yaml b/.github/workflows/test-e2e.yaml index 5bbf9edea..fbe66810f 100644 --- a/.github/workflows/test-e2e.yaml +++ b/.github/workflows/test-e2e.yaml @@ -73,11 +73,11 @@ jobs: mkdir -p artifacts/notebooks # Create the output directory # Execute make commands, passing notebook paths and output locations make test-e2e-notebook \ - NOTEBOOK_INPUT=./examples/training/pytorch/image-classification/mnist.ipynb \ + NOTEBOOK_INPUT=./trainer/examples/pytorch/image-classification/mnist.ipynb \ NOTEBOOK_OUTPUT=./artifacts/notebooks/${{ matrix.kubernetes-version }}_mnist.ipynb \ PAPERMILL_TIMEOUT=900 make test-e2e-notebook \ - NOTEBOOK_INPUT=./examples/training/pytorch/question-answering/fine-tune-distilbert.ipynb \ + NOTEBOOK_INPUT=./trainer/examples/pytorch/question-answering/fine-tune-distilbert.ipynb \ NOTEBOOK_OUTPUT=./artifacts/notebooks/${{ matrix.kubernetes-version }}_fine-tune-distilbert.ipynb \ PAPERMILL_TIMEOUT=900 working-directory: . 
# Execute make from the root of the SDK repo diff --git a/Makefile b/Makefile index 4fd0624ce..38668ef56 100644 --- a/Makefile +++ b/Makefile @@ -9,7 +9,7 @@ PROJECT_DIR := $(shell dirname $(abspath $(lastword $(MAKEFILE_LIST)))) LOCALBIN ?= $(PROJECT_DIR)/bin # Tool versions -KIND_VERSION ?= $(shell go list -m -f '{{.Version}}' sigs.k8s.io/kind) +KIND_VERSION ?= v0.27.0 K8S_VERSION ?= 1.32.0 # Tool binaries From 8bc9499a33be629046453050526314f4dad65fb7 Mon Sep 17 00:00:00 2001 From: Brian Gallagher Date: Fri, 11 Jul 2025 12:36:27 +0100 Subject: [PATCH 3/6] addressing review comments Signed-off-by: Brian Gallagher --- .github/workflows/test-e2e.yaml | 2 +- hack/e2e-setup-cluster.sh | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/test-e2e.yaml b/.github/workflows/test-e2e.yaml index fbe66810f..db2e58161 100644 --- a/.github/workflows/test-e2e.yaml +++ b/.github/workflows/test-e2e.yaml @@ -12,7 +12,7 @@ jobs: strategy: fail-fast: false matrix: - kubernetes-version: ["1.29.14", "1.30.0", "1.31.0", "1.32.3"] + kubernetes-version: ["1.30.0", "1.31.0", "1.32.3", "1.33.2"] steps: - name: Checkout Kubeflow SDK repository diff --git a/hack/e2e-setup-cluster.sh b/hack/e2e-setup-cluster.sh index 787f8ef3e..c1a995686 100755 --- a/hack/e2e-setup-cluster.sh +++ b/hack/e2e-setup-cluster.sh @@ -22,7 +22,6 @@ set -o pipefail set -x # Configure variables. 
-# KUBEFLOW_TRAINER_REPO="https://github.com/kubeflow/trainer" # This is no longer needed KIND=${KIND:-./bin/kind} K8S_VERSION=${K8S_VERSION:-1.32.0} KIND_NODE_VERSION=kindest/node:v${K8S_VERSION} From bce90c027bc5fbddb5c045151c180d9d62bf0897 Mon Sep 17 00:00:00 2001 From: Brian Gallagher Date: Mon, 14 Jul 2025 17:10:57 +0100 Subject: [PATCH 4/6] running cluster setup and e2e tests from checked out trainer repo Signed-off-by: Brian Gallagher --- .github/workflows/test-e2e.yaml | 32 ++-------- Makefile | 33 ----------- hack/e2e-run-notebook.sh | 49 ---------------- hack/e2e-setup-cluster.sh | 100 -------------------------------- 4 files changed, 6 insertions(+), 208 deletions(-) delete mode 100644 Makefile delete mode 100755 hack/e2e-run-notebook.sh delete mode 100755 hack/e2e-setup-cluster.sh diff --git a/.github/workflows/test-e2e.yaml b/.github/workflows/test-e2e.yaml index db2e58161..4113f7f9c 100644 --- a/.github/workflows/test-e2e.yaml +++ b/.github/workflows/test-e2e.yaml @@ -18,7 +18,6 @@ jobs: - name: Checkout Kubeflow SDK repository uses: actions/checkout@v4 - # Checkout the Kubeflow Trainer repository in order to get the Go and KIND versions. - name: Checkout Kubeflow Trainer repo for Go version uses: actions/checkout@v4 with: @@ -26,26 +25,6 @@ jobs: ref: master path: trainer - # Step 3: Get Go and KIND versions from the trainer repo's go.mod - # These versions will be set as environment variables for subsequent steps. - - name: Get Go and KIND versions from trainer repo - run: | - echo "Extracting Go version from trainer/go.mod..." - GO_VERSION=$(grep '^go ' ./trainer/go.mod | awk '{print $2}' | tr -d '\n') - echo "Detected Go version from trainer/go.mod: $GO_VERSION" - echo "GO_VERSION=$GO_VERSION" >> "$GITHUB_ENV" - - echo "Extracting KIND version from trainer/go.mod..." 
- # This specifically looks for 'sigs.k8s.io/kind' and extracts its version - KIND_VERSION=$(grep 'sigs.k8s.io/kind' ./trainer/go.mod | awk '{print $2}' | tr -d '\n') - echo "Detected KIND version from trainer/go.mod: $KIND_VERSION" - echo "KIND_VERSION=$KIND_VERSION" >> "$GITHUB_ENV" - - - name: Setup Go - uses: actions/setup-go@v5 - with: - go-version: ${{ env.GO_VERSION }} # Use the GO_VERSION environment variable - - name: Setup Python uses: actions/setup-python@v5 with: @@ -63,22 +42,23 @@ jobs: - name: Setup cluster run: | + cd ./trainer make test-e2e-setup-cluster \ K8S_VERSION=${{ matrix.kubernetes-version }} \ - KIND_VERSION=${{ env.KIND_VERSION }} working-directory: . # Execute make from the root of the SDK repo - name: Run e2e test for example Notebooks run: | mkdir -p artifacts/notebooks # Create the output directory + cd ./trainer # Execute make commands, passing notebook paths and output locations make test-e2e-notebook \ - NOTEBOOK_INPUT=./trainer/examples/pytorch/image-classification/mnist.ipynb \ - NOTEBOOK_OUTPUT=./artifacts/notebooks/${{ matrix.kubernetes-version }}_mnist.ipynb \ + NOTEBOOK_INPUT=./examples/pytorch/image-classification/mnist.ipynb \ + NOTEBOOK_OUTPUT=../artifacts/notebooks/${{ matrix.kubernetes-version }}_mnist.ipynb \ PAPERMILL_TIMEOUT=900 make test-e2e-notebook \ - NOTEBOOK_INPUT=./trainer/examples/pytorch/question-answering/fine-tune-distilbert.ipynb \ - NOTEBOOK_OUTPUT=./artifacts/notebooks/${{ matrix.kubernetes-version }}_fine-tune-distilbert.ipynb \ + NOTEBOOK_INPUT=./examples/pytorch/question-answering/fine-tune-distilbert.ipynb \ + NOTEBOOK_OUTPUT=../artifacts/notebooks/${{ matrix.kubernetes-version }}_fine-tune-distilbert.ipynb \ PAPERMILL_TIMEOUT=900 working-directory: . 
# Execute make from the root of the SDK repo diff --git a/Makefile b/Makefile deleted file mode 100644 index 38668ef56..000000000 --- a/Makefile +++ /dev/null @@ -1,33 +0,0 @@ -# Get the currently used golang install path (in GOPATH/bin, unless GOBIN is set) -ifeq (,$(shell go env GOBIN)) -GOBIN=$(shell go env GOPATH)/bin -else -GOBIN=$(shell go env GOBIN) -endif - -PROJECT_DIR := $(shell dirname $(abspath $(lastword $(MAKEFILE_LIST)))) -LOCALBIN ?= $(PROJECT_DIR)/bin - -# Tool versions -KIND_VERSION ?= v0.27.0 -K8S_VERSION ?= 1.32.0 - -# Tool binaries -KIND ?= $(LOCALBIN)/kind - -# Input and output location for Notebooks executed with Papermill. -NOTEBOOK_INPUT=$(PROJECT_DIR)/examples/training/pytorch/image-classification/mnist.ipynb -NOTEBOOK_OUTPUT=$(PROJECT_DIR)/artifacts/notebooks/trainer_output.ipynb -PAPERMILL_TIMEOUT=900 - -.PHONY: kind -kind: ## Download Kind binary if required. - GOBIN=$(LOCALBIN) go install sigs.k8s.io/kind@$(KIND_VERSION) - -.PHONY: test-e2e-notebook -test-e2e-notebook: ## Run Jupyter Notebook with Papermill. - NOTEBOOK_INPUT=$(NOTEBOOK_INPUT) NOTEBOOK_OUTPUT=$(NOTEBOOK_OUTPUT) PAPERMILL_TIMEOUT=$(PAPERMILL_TIMEOUT) ./hack/e2e-run-notebook.sh - -.PHONY: test-e2e-setup-cluster -test-e2e-setup-cluster: kind ## Setup Kind cluster for e2e test. - KIND=$(KIND) K8S_VERSION=$(K8S_VERSION) ./hack/e2e-setup-cluster.sh diff --git a/hack/e2e-run-notebook.sh b/hack/e2e-run-notebook.sh deleted file mode 100755 index 17ba6a5bd..000000000 --- a/hack/e2e-run-notebook.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env bash - -# Copyright 2025 The Kubeflow Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# This shell is used to run Jupyter Notebook with Papermill. - -set -o errexit -set -o nounset -set -o pipefail -set -x - -if [ -z "${NOTEBOOK_INPUT}" ]; then - echo "NOTEBOOK_INPUT env variable must be set to run this script." - exit 1 -fi - -if [ -z "${NOTEBOOK_OUTPUT}" ]; then - echo "NOTEBOOK_OUTPUT env variable must be set to run this script." - exit 1 -fi - -if [ -z "${PAPERMILL_TIMEOUT}" ]; then - echo "PAPERMILL_TIMEOUT env variable must be set to run this script." - exit 1 -fi - -print_results() { - kubectl get pods - kubectl describe pod - kubectl describe trainjob - kubectl logs -n kubeflow-system -l app.kubernetes.io/name=trainer - kubectl logs -l jobset.sigs.k8s.io/replicatedjob-name=trainer-node,batch.kubernetes.io/job-completion-index=0 --tail -1 - kubectl wait trainjob --for=condition=Complete --all --timeout 3s -} - -(papermill "${NOTEBOOK_INPUT}" "${NOTEBOOK_OUTPUT}" --execution-timeout "${PAPERMILL_TIMEOUT}" && print_results) || - (print_results && exit 1) diff --git a/hack/e2e-setup-cluster.sh b/hack/e2e-setup-cluster.sh deleted file mode 100755 index c1a995686..000000000 --- a/hack/e2e-setup-cluster.sh +++ /dev/null @@ -1,100 +0,0 @@ -#!/usr/bin/env bash - -# Copyright 2025 The Kubeflow Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# This shell is used to setup Kind cluster for Kubeflow Trainer e2e tests. - -set -o errexit -set -o nounset -set -o pipefail -set -x - -# Configure variables. -KIND=${KIND:-./bin/kind} -K8S_VERSION=${K8S_VERSION:-1.32.0} -KIND_NODE_VERSION=kindest/node:v${K8S_VERSION} -NAMESPACE="kubeflow-system" -TIMEOUT="5m" - -# Define the path to the already cloned Kubeflow Trainer repository -# This path is relative to where the e2e-setup-cluster.sh script is executed (which is the SDK repo root) -TRAINER_REPO_PATH="./trainer" - -# Kubeflow Trainer images. -# TODO (andreyvelich): Support initializers images. -CONTROLLER_MANAGER_CI_IMAGE_NAME="ghcr.io/kubeflow/trainer/trainer-controller-manager" -CONTROLLER_MANAGER_CI_IMAGE_TAG="test" -CONTROLLER_MANAGER_CI_IMAGE="${CONTROLLER_MANAGER_CI_IMAGE_NAME}:${CONTROLLER_MANAGER_CI_IMAGE_TAG}" - -echo "Build Kubeflow Trainer images" -# Change 'docker build .' 
to 'docker build "${TRAINER_REPO_PATH}"' -# And specify the Dockerfile path relative to the TRAINER_REPO_PATH -docker build "${TRAINER_REPO_PATH}" -f "${TRAINER_REPO_PATH}"/cmd/trainer-controller-manager/Dockerfile -t "${CONTROLLER_MANAGER_CI_IMAGE}" - -echo "Create Kind cluster and load Kubeflow Trainer images" -"${KIND}" create cluster --image "${KIND_NODE_VERSION}" -"${KIND}" load docker-image "${CONTROLLER_MANAGER_CI_IMAGE}" - -echo "Deploy Kubeflow Trainer control plane and Jobset controller" -E2E_MANIFESTS_DIR="artifacts/e2e/manifests" -mkdir -p "${E2E_MANIFESTS_DIR}" -cat <<EOF >"${E2E_MANIFESTS_DIR}/kustomization.yaml" - apiVersion: kustomize.config.k8s.io/v1beta1 - kind: Kustomization - resources: - - "../../../trainer/manifests/overlays/manager" - images: - - name: "${CONTROLLER_MANAGER_CI_IMAGE_NAME}" - newTag: "${CONTROLLER_MANAGER_CI_IMAGE_TAG}" -EOF - -# Ensure kubectl applies from the correct context. -# The `E2E_MANIFESTS_DIR` is relative to where the script is run (SDK repo root). -kubectl apply --server-side -k "${E2E_MANIFESTS_DIR}" - -# We should wait until Deployment is in Ready status. -echo "Wait for Kubeflow Trainer to be ready" -(kubectl wait deploy/kubeflow-trainer-controller-manager --for=condition=available -n "${NAMESPACE}" --timeout "${TIMEOUT}" && - kubectl wait pods --for=condition=ready -n "${NAMESPACE}" --timeout "${TIMEOUT}" --all) || - ( - echo "Failed to wait until Kubeflow Trainer is ready" && - kubectl get pods -n "${NAMESPACE}" && - kubectl describe pods -n "${NAMESPACE}" && - exit 1 - ) - -print_cluster_info() { - kubectl version - kubectl cluster-info - kubectl get nodes - kubectl get pods -n "${NAMESPACE}" - kubectl describe pod -n "${NAMESPACE}" -} - -# TODO (andreyvelich): Currently, we print manager logs due to flaky test.
-echo "Deploy Kubeflow Trainer runtimes" -# Adjust path to manifests/overlays/runtimes to be relative to the cloned Trainer repo -kubectl apply --server-side -k "${TRAINER_REPO_PATH}/manifests/overlays/runtimes" || ( - kubectl logs -n "${NAMESPACE}" -l app.kubernetes.io/name=trainer && - print_cluster_info && - exit 1 -) - -# TODO (andreyvelich): Discuss how we want to pre-load runtime images to the Kind cluster. -TORCH_RUNTIME_IMAGE=pytorch/pytorch:2.5.0-cuda12.4-cudnn9-runtime -docker pull "${TORCH_RUNTIME_IMAGE}" -"${KIND}" load docker-image "${TORCH_RUNTIME_IMAGE}" - -print_cluster_info From d7fbbed7e879304eac92041c2c384b362d7924b9 Mon Sep 17 00:00:00 2001 From: Brian Gallagher Date: Thu, 17 Jul 2025 09:43:46 +0100 Subject: [PATCH 5/6] update kubernetes-version support Signed-off-by: Brian Gallagher --- .github/workflows/test-e2e.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test-e2e.yaml b/.github/workflows/test-e2e.yaml index 4113f7f9c..f4ba26460 100644 --- a/.github/workflows/test-e2e.yaml +++ b/.github/workflows/test-e2e.yaml @@ -12,13 +12,13 @@ jobs: strategy: fail-fast: false matrix: - kubernetes-version: ["1.30.0", "1.31.0", "1.32.3", "1.33.2"] + kubernetes-version: ["1.29.14", "1.30.0", "1.31.0", "1.32.3"] steps: - name: Checkout Kubeflow SDK repository uses: actions/checkout@v4 - - name: Checkout Kubeflow Trainer repo for Go version + - name: Checkout Kubeflow Trainer repository uses: actions/checkout@v4 with: repository: kubeflow/trainer From 23a809517829596ce0722aefa479ad9321f75507 Mon Sep 17 00:00:00 2001 From: Brian Gallagher Date: Thu, 17 Jul 2025 13:43:53 +0100 Subject: [PATCH 6/6] use oci gh arc runner for e2e test workflow Signed-off-by: Brian Gallagher --- .github/workflows/test-e2e.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test-e2e.yaml b/.github/workflows/test-e2e.yaml index f4ba26460..1200efa21 100644 --- a/.github/workflows/test-e2e.yaml +++ 
b/.github/workflows/test-e2e.yaml @@ -6,13 +6,13 @@ on: jobs: e2e-test: name: E2E Test - runs-on: - labels: ubuntu-latest-16-cores + runs-on: oracle-vm-16cpu-64gb-x86-64 strategy: fail-fast: false matrix: kubernetes-version: ["1.29.14", "1.30.0", "1.31.0", "1.32.3"] + trainer-ref: ["master"] steps: - name: Checkout Kubeflow SDK repository @@ -22,7 +22,7 @@ jobs: uses: actions/checkout@v4 with: repository: kubeflow/trainer - ref: master + ref: ${{ matrix.trainer-ref }} path: trainer - name: Setup Python