# NOTE(review): page residue captured alongside this file, not part of the workflow:
# KEP-2170: Add PyTorch DDP MNIST training example #125


# End-to-end test workflow for the train API; triggered by pull_request events.
name: E2E Test with train API
on:
  - pull_request
# One active run per workflow + ref: a newer run cancels the one in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true
jobs:
  # Builds the trainer and storage-initializer images, loads them into a kind
  # cluster, then runs the fine-tune-LLM e2e test with pytest.
  e2e-test:
    runs-on: ubuntu-latest
    strategy:
      # Let the remaining matrix combinations finish when one of them fails.
      fail-fast: false
      matrix:
        # Versions stay quoted so YAML keeps them as strings ("3.10", not 3.1).
        kubernetes-version: ["v1.31.4"]
        python-version: ["3.9", "3.10", "3.11"]
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      # Local action from this repository; receives the matrix versions.
      # NOTE(review): presumably provisions the kind cluster and Python - confirm
      # against the action definition.
      - name: Setup E2E Tests
        uses: ./.github/workflows/setup-e2e-test
        with:
          kubernetes-version: ${{ matrix.kubernetes-version }}
          python-version: ${{ matrix.python-version }}

      # The image tag is handed to the build script through the environment.
      - name: Build trainer
        run: |
          ./scripts/gha/build-trainer.sh
        env:
          TRAINER_CI_IMAGE: kubeflowtraining/trainer:test

      # Copy the locally built image into the kind cluster named by KIND_CLUSTER.
      - name: Load trainer
        run: |
          kind load docker-image ${{ env.TRAINER_CI_IMAGE }} --name ${{ env.KIND_CLUSTER }}
        env:
          KIND_CLUSTER: training-operator-cluster
          TRAINER_CI_IMAGE: kubeflowtraining/trainer:test

      - name: Build storage initializer
        run: |
          ./scripts/gha/build-storage-initializer.sh
        env:
          STORAGE_INITIALIZER_CI_IMAGE: kubeflowtraining/storage-initializer:test
          # NOTE(review): unclear whether this script reads TRAINER_CI_IMAGE;
          # kept as in the original - confirm before removing.
          TRAINER_CI_IMAGE: kubeflowtraining/trainer:test

      # Copy the locally built image into the kind cluster named by KIND_CLUSTER.
      - name: Load storage initializer
        run: |
          kind load docker-image ${{ env.STORAGE_INITIALIZER_CI_IMAGE }} --name ${{ env.KIND_CLUSTER }}
        env:
          KIND_CLUSTER: training-operator-cluster
          STORAGE_INITIALIZER_CI_IMAGE: kubeflowtraining/storage-initializer:test

      # The extras spec is quoted so the shell cannot treat "[huggingface]" as a
      # glob character class.
      # NOTE(review): the two image variables are presumably read by the test -
      # verify against the test module.
      - name: Run tests
        run: |
          pip install pytest
          python3 -m pip install -e "sdk/python[huggingface]"
          pytest -s sdk/python/test/e2e-fine-tune-llm/test_e2e_pytorch_fine_tune_llm.py --log-cli-level=debug
        env:
          STORAGE_INITIALIZER_IMAGE: kubeflowtraining/storage-initializer:test
          TRAINER_TRANSFORMER_IMAGE: kubeflowtraining/trainer:test