KEP-2170: Add PyTorch DDP MNIST training example #126
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# End-to-end test workflow for the train API.
# Runs the trainer and storage-initializer build scripts, loads the resulting
# images into a Kind cluster, then runs the LLM fine-tuning e2e test via pytest.
name: E2E Test with train API

on:
  - pull_request

# Restrict the workflow token to read-only repository contents; no step
# visible in this file writes back to the repository.
# NOTE(review): confirm ./.github/workflows/setup-e2e-test needs no other scopes.
permissions:
  contents: read

# Runs are grouped by workflow name and ref; a newer run in the same group
# cancels the one still in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  e2e-test:
    runs-on: ubuntu-latest
    strategy:
      # Let every matrix combination finish even if one of them fails.
      fail-fast: false
      matrix:
        kubernetes-version: ["v1.31.4"]
        # Quoted so "3.10" stays a string instead of becoming the float 3.1.
        python-version: ["3.9", "3.10", "3.11"]
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      # Local action from this repository; receives the matrix versions.
      - name: Setup E2E Tests
        uses: ./.github/workflows/setup-e2e-test
        with:
          kubernetes-version: ${{ matrix.kubernetes-version }}
          python-version: ${{ matrix.python-version }}

      - name: Build trainer
        run: |
          ./scripts/gha/build-trainer.sh
        env:
          TRAINER_CI_IMAGE: kubeflowtraining/trainer:test

      # The image and cluster names are read from the step environment as
      # quoted shell variables rather than spliced into the script text.
      - name: Load trainer
        run: |
          kind load docker-image "${TRAINER_CI_IMAGE}" --name "${KIND_CLUSTER}"
        env:
          KIND_CLUSTER: training-operator-cluster
          TRAINER_CI_IMAGE: kubeflowtraining/trainer:test

      - name: Build storage initializer
        run: |
          ./scripts/gha/build-storage-initializer.sh
        env:
          STORAGE_INITIALIZER_CI_IMAGE: kubeflowtraining/storage-initializer:test
          # NOTE(review): presumably read by the build script; kept as in the
          # original - confirm whether it is actually needed here.
          TRAINER_CI_IMAGE: kubeflowtraining/trainer:test

      - name: Load storage initializer
        run: |
          kind load docker-image "${STORAGE_INITIALIZER_CI_IMAGE}" --name "${KIND_CLUSTER}"
        env:
          KIND_CLUSTER: training-operator-cluster
          STORAGE_INITIALIZER_CI_IMAGE: kubeflowtraining/storage-initializer:test

      # Both installs go through `python3 -m pip` so they target the same
      # interpreter; the extras spec is quoted so the shell never treats the
      # square brackets as a glob pattern.
      - name: Run tests
        run: |
          python3 -m pip install pytest
          python3 -m pip install -e "sdk/python[huggingface]"
          pytest -s sdk/python/test/e2e-fine-tune-llm/test_e2e_pytorch_fine_tune_llm.py --log-cli-level=debug
        env:
          STORAGE_INITIALIZER_IMAGE: kubeflowtraining/storage-initializer:test
          TRAINER_TRANSFORMER_IMAGE: kubeflowtraining/trainer:test