diff --git a/examples/deepspeed/text-summarization/T5-Fine-Tuning.ipynb b/examples/deepspeed/text-summarization/T5-Fine-Tuning.ipynb
index 3e386b288f..b692368483 100644
--- a/examples/deepspeed/text-summarization/T5-Fine-Tuning.ipynb
+++ b/examples/deepspeed/text-summarization/T5-Fine-Tuning.ipynb
@@ -20,40 +20,45 @@
     ]
   },
   {
-   "metadata": {},
    "cell_type": "markdown",
+   "id": "1c461b2984e77d99",
+   "metadata": {},
    "source": [
     "## Install the Kubeflow SDK\n",
     "\n",
     "You need to install the Kubeflow SDK to interact with Kubeflow Trainer APIs:"
-   ],
-   "id": "1c461b2984e77d99"
+   ]
   },
   {
-   "metadata": {},
    "cell_type": "code",
-   "outputs": [],
    "execution_count": null,
-   "source": "# !pip install git+https://github.com/kubeflow/sdk.git@main#subdirectory=python",
-   "id": "4900404c5d532bdf"
+   "id": "4900404c5d532bdf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# !pip install git+https://github.com/kubeflow/sdk.git@main#subdirectory=python"
+   ]
   },
   {
-   "metadata": {},
    "cell_type": "markdown",
+   "id": "47534ee4955f3ff6",
+   "metadata": {},
    "source": [
     "## Create Script to Fine-Tune T5 with DeepSpeed\n",
     "\n",
     "We need to wrap our fine-tuning script into a function to create Kubeflow TrainJob."
-   ],
-   "id": "47534ee4955f3ff6"
+   ]
   },
   {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "35f06c45b614ecd0",
    "metadata": {
     "jupyter": {
      "is_executing": true
     }
    },
-   "cell_type": "code",
+   "outputs": [],
    "source": [
     "def deepspeed_train_t5(args):\n",
     "    import os\n",
@@ -227,10 +232,7 @@
     "    file_path = os.path.join(HOME_PATH, \"global_step94/mp_rank_00_model_states.pt\")\n",
     "    bucket = boto3.resource(\"s3\").Bucket(args[\"BUCKET\"])\n",
     "    bucket.upload_file(file_path, f\"deepspeed/{file_path}\")"
-   ],
-   "id": "35f06c45b614ecd0",
-   "outputs": [],
-   "execution_count": null
+   ]
   },
   {
    "cell_type": "markdown",
@@ -242,7 +244,7 @@
    "\n",
    "Get available Kubeflow Trainer Runtimes with the `list_runtimes()` API.\n",
    "\n",
-   "You can inspect Runtime details, including the name, framework, entry point, and number of accelerators.\n",
+   "You can inspect Runtime details, including the name, framework, and number of accelerators.\n",
    "\n",
    "- Runtimes with **CustomTrainer**: You must write the training script within the function.\n",
    "\n",
@@ -251,7 +253,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 3,
    "id": "51d8bc1d-8d9b-48f7-866f-c6ad4ad4241b",
    "metadata": {},
    "outputs": [
@@ -261,9 +263,7 @@
      "text": [
       "Name: deepspeed-distributed, Framework: deepspeed, Trainer Type: CustomTrainer\n",
       "\n",
-      "Entrypoint: ['mpirun', '--hostfile', '/etc/mpi/hostfile']\n",
-      "\n",
-      "Runtime Accelerators: 4 x gpu-tesla-v100-16gb\n"
+      "Runtime Accelerators: 4\n"
      ]
     }
    ],
@@ -271,11 +271,9 @@
    "from kubeflow.trainer import TrainerClient, CustomTrainer\n",
    "\n",
    "for r in TrainerClient().list_runtimes():\n",
-   "    print(f\"Name: {r.name}, Framework: {r.trainer.framework.value}, Trainer Type: {r.trainer.trainer_type.value}\\n\")\n",
-   "    print(f\"Entrypoint: {r.trainer.entrypoint[:3]}\\n\")\n",
-   "    print(f\"Runtime Accelerators: {r.trainer.accelerator_count} x {r.trainer.accelerator}\")\n",
-   "\n",
    "    if r.name == \"deepspeed-distributed\":\n",
+   "        print(f\"Name: {r.name}, Framework: {r.trainer.framework}, Trainer Type: {r.trainer.trainer_type.value}\\n\")\n",
+   "        print(f\"Runtime Accelerators: {r.trainer.accelerator_count}\")\n",
    "        deepspeed_runtime = r"
    ]
   },
@@ -1066,7 +1064,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.11"
+   "version": "3.11.13"
   }
  },
"nbformat": 4, diff --git a/examples/mlx/image-classification/MLX-Distributed-Mnist.ipynb b/examples/mlx/image-classification/MLX-Distributed-Mnist.ipynb index 9a39532093..a3ebc8209e 100644 --- a/examples/mlx/image-classification/MLX-Distributed-Mnist.ipynb +++ b/examples/mlx/image-classification/MLX-Distributed-Mnist.ipynb @@ -67,22 +67,24 @@ ] }, { - "metadata": {}, "cell_type": "markdown", + "id": "3cb9ab4c8e12221a", + "metadata": {}, "source": [ "## Install the Kubeflow SDK\n", "\n", "You need to install the Kubeflow SDK to interact with Kubeflow Trainer APIs:" - ], - "id": "3cb9ab4c8e12221a" + ] }, { - "metadata": {}, "cell_type": "code", - "outputs": [], "execution_count": null, - "source": "# !pip install git+https://github.com/kubeflow/sdk.git@main#subdirectory=python", - "id": "bd62189280760f42" + "id": "bd62189280760f42", + "metadata": {}, + "outputs": [], + "source": [ + "# !pip install git+https://github.com/kubeflow/sdk.git@main#subdirectory=python" + ] }, { "cell_type": "markdown", @@ -233,7 +235,7 @@ "\n", "Get available Kubeflow Trainer Runtimes with the `list_runtimes()` API.\n", "\n", - "You can inspect Runtime details, including the name, framework, entry point, and number of accelerators.\n", + "You can inspect Runtime details, including the name, framework, and number of accelerators.\n", "\n", "- Runtimes with **CustomTrainer**: You must write the training script within the function.\n", "\n", @@ -242,7 +244,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 4, "id": "51d8bc1d-8d9b-48f7-866f-c6ad4ad4241b", "metadata": {}, "outputs": [ @@ -251,8 +253,6 @@ "output_type": "stream", "text": [ "Name: mlx-distributed, Framework: mlx, Trainer Type: CustomTrainer\n", - "\n", - "Entrypoint: ['mpirun', '--hostfile', '/etc/mpi/hostfile']\n", "\n" ] } @@ -261,10 +261,8 @@ "from kubeflow.trainer import TrainerClient, CustomTrainer\n", "\n", "for r in TrainerClient().list_runtimes():\n", - " print(f\"Name: {r.name}, Framework: {r.trainer.framework.value}, Trainer Type: {r.trainer.trainer_type.value}\\n\")\n", - " print(f\"Entrypoint: {r.trainer.entrypoint[:3]}\\n\")\n", - "\n", " if r.name == \"mlx-distributed\":\n", + " print(f\"Name: {r.name}, Framework: {r.trainer.framework}, Trainer Type: {r.trainer.trainer_type.value}\\n\")\n", " mlx_runtime = r" ] }, @@ -725,7 +723,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.13.2" + "version": "3.11.13" } }, "nbformat": 4, diff --git a/examples/pytorch/image-classification/mnist.ipynb b/examples/pytorch/image-classification/mnist.ipynb index 4f7a93d176..97a206a50d 100644 --- a/examples/pytorch/image-classification/mnist.ipynb +++ b/examples/pytorch/image-classification/mnist.ipynb @@ -355,7 +355,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 2, "metadata": { "pycharm": { "name": "#%%\n" @@ -381,14 +381,18 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Runtime(name='torch-distributed', trainer=Trainer(trainer_type=, framework=, entrypoint='torchrun', accelerator='gpu-tesla-v100-16gb', accelerator_count='4.0'), pretrained_model=None)\n" + "Runtime(name='deepspeed-distributed', trainer=RuntimeTrainer(trainer_type=, framework='deepspeed', num_nodes=1, accelerator_count=4), pretrained_model=None)\n", + "Runtime(name='mlx-distributed', trainer=RuntimeTrainer(trainer_type=, framework='mlx', num_nodes=1, accelerator_count=1), 
+      "Runtime(name='mlx-distributed', trainer=RuntimeTrainer(trainer_type=, framework='mlx', num_nodes=1, accelerator_count=1), pretrained_model=None)\n",
+      "Runtime(name='torch-distributed', trainer=RuntimeTrainer(trainer_type=, framework='torch', num_nodes=1, accelerator_count='Unknown'), pretrained_model=None)\n",
+      "Runtime(name='torchtune-llama3.2-1b', trainer=RuntimeTrainer(trainer_type=, framework='torchtune', num_nodes=1, accelerator_count='2.0'), pretrained_model=None)\n",
+      "Runtime(name='torchtune-llama3.2-3b', trainer=RuntimeTrainer(trainer_type=, framework='torchtune', num_nodes=1, accelerator_count='2.0'), pretrained_model=None)\n"
      ]
     }
    ],
@@ -444,32 +448,14 @@
     "You can get the individual status for each of these steps."
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import time\n",
-    "\n",
-    "def wait_for_job_running():\n",
-    "    for _ in range(100):\n",
-    "        trainjob = client.get_job(name=job_name)\n",
-    "        for c in trainjob.steps:\n",
-    "            if c.name == \"trainer-node-0\" and c.status == \"Running\":\n",
-    "                return\n",
-    "        print(\"Wait for TrainJob running status. Sleep for 5 seconds\")\n",
-    "        time.sleep(5)"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": 8,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# TODO (andreyvelich): Use wait_for_job_status API from TrainerClient() when it is implemented.\n",
-    "wait_for_job_running()"
+    "# Wait for the running status.\n",
+    "client.wait_for_job_status(name=job_name, status={\"Running\"})"
    ]
   },
@@ -609,7 +595,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "mlx",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -623,7 +609,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.13.2"
+   "version": "3.11.13"
   }
  },
  "nbformat": 4,
diff --git a/examples/pytorch/question-answering/fine-tune-distilbert.ipynb b/examples/pytorch/question-answering/fine-tune-distilbert.ipynb
index c17f685a09..b08333862f 100644
--- a/examples/pytorch/question-answering/fine-tune-distilbert.ipynb
+++ b/examples/pytorch/question-answering/fine-tune-distilbert.ipynb
@@ -20,27 +20,35 @@
    "cell_type": "markdown",
    "id": "c31bc8f2",
    "metadata": {},
-   "source": "# Install the Kubeflow SDK"
+   "source": [
+    "# Install the Kubeflow SDK"
+   ]
   },
   {
-   "metadata": {},
    "cell_type": "markdown",
-   "source": "You need to install the Kubeflow SDK to interact with Kubeflow Trainer APIs:",
-   "id": "4504947e33b6021a"
+   "id": "4504947e33b6021a",
+   "metadata": {},
+   "source": [
+    "You need to install the Kubeflow SDK to interact with Kubeflow Trainer APIs:"
+   ]
   },
   {
-   "metadata": {},
    "cell_type": "code",
-   "outputs": [],
    "execution_count": null,
-   "source": "# !pip install git+https://github.com/kubeflow/sdk.git@main#subdirectory=python",
-   "id": "e5e86ea307b3eec9"
+   "id": "e5e86ea307b3eec9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# !pip install git+https://github.com/kubeflow/sdk.git@main#subdirectory=python"
+   ]
   },
   {
-   "metadata": {},
    "cell_type": "markdown",
-   "source": "Install dependencies",
-   "id": "21c20e6ec87dcec2"
+   "id": "21c20e6ec87dcec2",
+   "metadata": {},
+   "source": [
+    "Install dependencies"
+   ]
   },
   {
    "cell_type": "code",
@@ -108,8 +116,8 @@
      "Requirement already satisfied: pycparser in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from cffi>=1.12->cryptography>=2.1.4->azure-storage-blob>=12->cloudpathlib[all]) (2.22)\n",
      "Requirement already satisfied: pyasn1<0.7.0,>=0.6.1 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from pyasn1-modules>=0.2.1->google-auth<3.0dev,>=2.26.1->google-cloud-storage->cloudpathlib[all]) (0.6.1)\n",
      "\n",
-      "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m A new release of pip is available: \u001B[0m\u001B[31;49m25.0.1\u001B[0m\u001B[39;49m -> \u001B[0m\u001B[32;49m25.1\u001B[0m\n",
-      "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m To update, run: \u001B[0m\u001B[32;49mpip install --upgrade pip\u001B[0m\n"
+      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m25.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.1\u001b[0m\n",
+      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n"
     ]
    }
   ],
@@ -256,22 +264,26 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 2,
    "id": "bf5ab9ba-6054-40d6-839d-f84ff0fba8fc",
    "metadata": {},
    "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Runtime mpi-distributed must have trainer.kubeflow.org/framework label.\n"
+     ]
+    },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "Name: deepspeed-distributed, Framework: deepspeed, Trainer Type: CustomTrainer\n",
-      "Entrypoint: ['mpirun', '--hostfile', '/etc/mpi/hostfile']\n",
       "Name: mlx-distributed, Framework: mlx, Trainer Type: CustomTrainer\n",
-      "Entrypoint: ['mpirun', '--hostfile', '/etc/mpi/hostfile']\n",
-      "Name: mpi-distributed, Framework: torch, Trainer Type: CustomTrainer\n",
-      "Entrypoint: ['torchrun']\n",
       "Name: torch-distributed, Framework: torch, Trainer Type: CustomTrainer\n",
-      "Entrypoint: ['torchrun']\n"
+      "Name: torchtune-llama3.2-1b, Framework: torchtune, Trainer Type: BuiltinTrainer\n",
+      "Name: torchtune-llama3.2-3b, Framework: torchtune, Trainer Type: BuiltinTrainer\n"
     ]
    }
   ],
@@ -279,8 +291,7 @@
    "from kubeflow.trainer import TrainerClient, CustomTrainer\n",
    "\n",
    "for r in TrainerClient().list_runtimes():\n",
-   "    print(f\"Name: {r.name}, Framework: {r.trainer.framework.value}, Trainer Type: {r.trainer.trainer_type.value}\")\n",
-   "    print(f\"Entrypoint: {r.trainer.entrypoint[:3]}\")"
+   "    print(f\"Name: {r.name}, Framework: {r.trainer.framework}, Trainer Type: {r.trainer.trainer_type.value}\")"
    ]
   },
@@ -368,32 +379,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": null,
    "id": "c9f8ef95-b309-4987-9abb-760dc9c1e050",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Waiting for TrainJob running status. Sleep for 5 seconds\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
-    "# TODO (andreyvelich): Use wait_for_job_status API from TrainerClient() when it is implemented.\n",
-    "import time\n",
-    "\n",
-    "def wait_for_job_running():\n",
-    "    for _ in range(100):\n",
-    "        trainjob = TrainerClient().get_job(name=job_id)\n",
-    "        for c in trainjob.steps:\n",
-    "            if c.status == \"Running\":\n",
-    "                return\n",
-    "        print(\"Waiting for TrainJob running status. Sleep for 5 seconds\")\n",
-    "        time.sleep(5)\n",
-    "\n",
-    "wait_for_job_running()"
+    "# Wait for the running status.\n",
+    "TrainerClient().wait_for_job_status(name=job_id, status={\"Running\"})"
    ]
   },
   {
    "cell_type": "markdown",
@@ -457,21 +449,21 @@
      " 0%|          | 0/40 [00:00
-      "Runtime(name='deepspeed-distributed', trainer=Trainer(trainer_type=, framework=, entrypoint=['mpirun', '--hostfile', '/etc/mpi/hostfile', 'bash', '-c'], accelerator='Unknown', accelerator_count=4), pretrained_model=None)\n",
-      "Runtime(name='mlx-distributed', trainer=Trainer(trainer_type=, framework=, entrypoint=['mpirun', '--hostfile', '/etc/mpi/hostfile', 'bash', '-c'], accelerator='Unknown', accelerator_count=1), pretrained_model=None)\n",
-      "Runtime(name='mpi-distributed', trainer=Trainer(trainer_type=, framework=, entrypoint=['torchrun'], accelerator='Unknown', accelerator_count=1), pretrained_model=None)\n",
-      "Runtime(name='torch-distributed', trainer=Trainer(trainer_type=, framework=, entrypoint=['torchrun'], accelerator='Unknown', accelerator_count='Unknown'), pretrained_model=None)\n",
-      "Runtime(name='torchtune-llama3.2-1b', trainer=Trainer(trainer_type=, framework=, entrypoint=['tune', 'run'], accelerator='Unknown', accelerator_count='2.0'), pretrained_model=None)\n",
-      "Runtime(name='torchtune-llama3.2-3b', trainer=Trainer(trainer_type=, framework=, entrypoint=['tune', 'run'], accelerator='Unknown', accelerator_count='2.0'), pretrained_model=None)\n"
+      "Runtime(name='deepspeed-distributed', trainer=RuntimeTrainer(trainer_type=, framework='deepspeed', num_nodes=1, accelerator_count=4), pretrained_model=None)\n",
+      "Runtime(name='mlx-distributed', trainer=RuntimeTrainer(trainer_type=, framework='mlx', num_nodes=1, accelerator_count=1), pretrained_model=None)\n",
+      "Runtime(name='torch-distributed', trainer=RuntimeTrainer(trainer_type=, framework='torch', num_nodes=1, accelerator_count='Unknown'), pretrained_model=None)\n",
+      "Runtime(name='torchtune-llama3.2-1b', trainer=RuntimeTrainer(trainer_type=, framework='torchtune', num_nodes=1, accelerator_count='2.0'), pretrained_model=None)\n",
+      "Runtime(name='torchtune-llama3.2-3b', trainer=RuntimeTrainer(trainer_type=, framework='torchtune', num_nodes=1, accelerator_count='2.0'), pretrained_model=None)\n"
     ]
    }
   ],
@@ -412,7 +411,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "training-operator",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -426,7 +425,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.9"
+   "version": "3.11.13"
   }
  },
  "nbformat": 4,
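
A note on the recurring change above: each notebook previously carried its own hand-rolled wait_for_job_running() helper that polled get_job() every five seconds, and the patch replaces all of them with the SDK's wait_for_job_status() API. A minimal sketch of the new pattern, assuming a TrainJob has already been created and its name is stored in job_name (an illustrative variable, not defined in this patch):

    from kubeflow.trainer import TrainerClient

    client = TrainerClient()

    # Block until the TrainJob reports the Running status; this replaces the
    # removed polling loops that slept for 5 seconds between get_job() calls.
    client.wait_for_job_status(name=job_name, status={"Running"})

    # The job's individual steps can still be inspected, as the old helpers did.
    for step in client.get_job(name=job_name).steps:
        print(step.name, step.status)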
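The runtime-listing cells change shape as well: RuntimeTrainer now exposes framework as a plain string (the old code called r.trainer.framework.value), while trainer_type is still an enum. A sketch of the updated discovery pattern, mirroring the cells above, with torch-distributed as an illustrative target runtime:

    from kubeflow.trainer import TrainerClient

    torch_runtime = None
    for r in TrainerClient().list_runtimes():
        # framework is a string on RuntimeTrainer; trainer_type is an enum.
        print(f"Name: {r.name}, Framework: {r.trainer.framework}, "
              f"Trainer Type: {r.trainer.trainer_type.value}")
        if r.name == "torch-distributed":
            torch_runtime = r  # keep a handle for submitting a TrainJob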
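For completeness, a hedged sketch of how a selected Runtime and a wrapped training function come together in a TrainJob. The train() call is not part of this patch, and the func_args and num_nodes parameters are assumptions based on the Kubeflow SDK's CustomTrainer; deepspeed_runtime and deepspeed_train_t5 refer to names from the T5 notebook above, and the bucket name is illustrative:

    from kubeflow.trainer import TrainerClient, CustomTrainer

    # Submit the wrapped function to the runtime selected via list_runtimes().
    job_name = TrainerClient().train(
        runtime=deepspeed_runtime,
        trainer=CustomTrainer(
            func=deepspeed_train_t5,
            func_args={"BUCKET": "my-bucket"},  # assumption: illustrative bucket
            num_nodes=2,
        ),
    )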