# Captured from the GitHub page for #2052:
# "Added an example Notebook to fine-tune Llama3 model using PyTorchJob"

# Runs the Python SDK end-to-end tests (sdk/python/test/e2e) on every pull
# request, across a matrix of Kubernetes versions and gang schedulers.
name: integration test

on:
  - pull_request

# One active run per workflow and ref: a newer run cancels the one in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  integration-test:
    runs-on: ubuntu-latest
    # Almost equivalent to the following:
    #
    # ```yaml
    # strategy:
    #   fail-fast: false
    #   matrix:
    #     kubernetes-version: ["v1.28.7", "v1.29.2", "v1.30.6"]
    #     gang-scheduler-name: ["none", "scheduler-plugins", "volcano"]
    # ```
    #
    # The difference is that every combination is listed explicitly so that
    # each one carries its own Python version (spread over 3.8 - 3.11) to
    # verify Python SDK operations on several interpreters.
    strategy:
      # Let the remaining combinations finish even when one of them fails.
      fail-fast: false
      matrix:
        include:
          # Python versions are quoted so that "3.10" is not read as the
          # float 3.1.
          # --- no gang scheduler ---
          - kubernetes-version: v1.30.6
            gang-scheduler-name: "none"
            python-version: "3.10"
          - kubernetes-version: v1.29.2
            gang-scheduler-name: "none"
            python-version: "3.10"
          - kubernetes-version: v1.28.7
            gang-scheduler-name: "none"
            python-version: "3.8"
          # --- scheduler-plugins ---
          - kubernetes-version: v1.30.6
            gang-scheduler-name: "scheduler-plugins"
            python-version: "3.10"
          - kubernetes-version: v1.29.2
            gang-scheduler-name: "scheduler-plugins"
            python-version: "3.9"
          - kubernetes-version: v1.28.7
            gang-scheduler-name: "scheduler-plugins"
            python-version: "3.11"
          # --- volcano ---
          - kubernetes-version: v1.30.6
            gang-scheduler-name: "volcano"
            python-version: "3.10"
          - kubernetes-version: v1.29.2
            gang-scheduler-name: "volcano"
            python-version: "3.8"
          - kubernetes-version: v1.28.7
            gang-scheduler-name: "volcano"
            python-version: "3.10"
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      # Local action from this repository; it receives the matrix values of
      # the current combination as inputs.
      - name: Setup E2E Tests
        uses: ./.github/workflows/setup-e2e-test
        with:
          kubernetes-version: ${{ matrix.kubernetes-version }}
          python-version: ${{ matrix.python-version }}
          gang-scheduler-name: ${{ matrix.gang-scheduler-name }}

      - name: Build JAX Job Example Image
        run: |
          ./scripts/gha/build-jax-mnist-image.sh
        env:
          JAX_JOB_CI_IMAGE: kubeflow/jaxjob-dist-spmd-mnist:test

      # The image and cluster names reach the script through the step's
      # environment, so they are read as quoted shell variables instead of
      # being expanded into the script text by the workflow engine.
      - name: Load JAX Job Example Image
        run: |
          kind load docker-image "${JAX_JOB_CI_IMAGE}" --name "${KIND_CLUSTER}"
        env:
          KIND_CLUSTER: training-operator-cluster
          JAX_JOB_CI_IMAGE: kubeflow/jaxjob-dist-spmd-mnist:test

      # Both installs go through `python3 -m pip` so pytest and the SDK end
      # up in the same interpreter; one command per line keeps failures easy
      # to attribute in the job log.
      - name: Run tests
        run: |
          python3 -m pip install pytest
          python3 -m pip install -e sdk/python
          pytest -s sdk/python/test/e2e --log-cli-level=debug --namespace=default
        env:
          GANG_SCHEDULER_NAME: ${{ matrix.gang-scheduler-name }}
          JAX_JOB_IMAGE: kubeflow/jaxjob-dist-spmd-mnist:test

      # Diagnostics only: runs when an earlier step failed on a volcano
      # combination. `--tail=-1` requests the complete logs.
      - name: Collect volcano logs
        if: ${{ failure() && matrix.gang-scheduler-name == 'volcano' }}
        run: |
          echo "dump volcano-scheduler logs..."
          kubectl logs -n volcano-system -l app=volcano-scheduler --tail=-1
          echo "dump volcano-admission logs..."
          kubectl logs -n volcano-system -l app=volcano-admission --tail=-1
          echo "dump volcano-controllers logs..."
          kubectl logs -n volcano-system -l app=volcano-controller --tail=-1
          echo "dump podgroups description..."
          kubectl describe podgroups.scheduling.volcano.sh -A