Merged
48 changes: 23 additions & 25 deletions examples/deepspeed/text-summarization/T5-Fine-Tuning.ipynb
@@ -20,40 +20,45 @@
]
},
{
"metadata": {},
"cell_type": "markdown",
"id": "1c461b2984e77d99",
"metadata": {},
"source": [
"## Install the Kubeflow SDK\n",
"\n",
"You need to install the Kubeflow SDK to interact with Kubeflow Trainer APIs:"
],
"id": "1c461b2984e77d99"
]
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": "# !pip install git+https://github.com/kubeflow/sdk.git@main#subdirectory=python",
"id": "4900404c5d532bdf"
"id": "4900404c5d532bdf",
"metadata": {},
"outputs": [],
"source": [
"# !pip install git+https://github.com/kubeflow/sdk.git@main#subdirectory=python"
]
},
{
"metadata": {},
"cell_type": "markdown",
"id": "47534ee4955f3ff6",
"metadata": {},
"source": [
"## Create Script to Fine-Tune T5 with DeepSpeed\n",
"\n",
"We need to wrap our fine-tuning script into a function to create a Kubeflow TrainJob."
],
"id": "47534ee4955f3ff6"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "35f06c45b614ecd0",
"metadata": {
"jupyter": {
"is_executing": true
}
},
"cell_type": "code",
"outputs": [],
"source": [
"def deepspeed_train_t5(args):\n",
" import os\n",
@@ -227,10 +232,7 @@
" file_path = os.path.join(HOME_PATH, \"global_step94/mp_rank_00_model_states.pt\")\n",
" bucket = boto3.resource(\"s3\").Bucket(args[\"BUCKET\"])\n",
" bucket.upload_file(file_path, f\"deepspeed/{file_path}\")"
],
"id": "35f06c45b614ecd0",
"outputs": [],
"execution_count": null
]
},
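The upload at the end of `deepspeed_train_t5` builds the S3 object key directly from the local checkpoint path, so (if `HOME_PATH` is absolute, as a home directory normally is) the full local path ends up embedded in the key, double slash included. A minimal sketch of that key construction, with an illustrative `HOME_PATH`:

```python
import os

# Illustrative value; the notebook derives the real HOME_PATH at runtime inside the pod.
HOME_PATH = "/home/user"

file_path = os.path.join(HOME_PATH, "global_step94/mp_rank_00_model_states.pt")
# The object key embeds the absolute local path, producing a double slash.
key = f"deepspeed/{file_path}"
print(key)  # deepspeed//home/user/global_step94/mp_rank_00_model_states.pt
```

If a flatter key layout were preferred, `os.path.basename(file_path)` would drop the directory prefix before building the key.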
{
"cell_type": "markdown",
@@ -242,7 +244,7 @@
"\n",
"Get available Kubeflow Trainer Runtimes with the `list_runtimes()` API.\n",
"\n",
"You can inspect Runtime details, including the name, framework, entry point, and number of accelerators.\n",
"You can inspect Runtime details, including the name, framework, and number of accelerators.\n",
"\n",
"- Runtimes with **CustomTrainer**: You must write the training script within the function.\n",
"\n",
@@ -251,7 +253,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 3,
"id": "51d8bc1d-8d9b-48f7-866f-c6ad4ad4241b",
"metadata": {},
"outputs": [
@@ -261,21 +263,17 @@
"text": [
"Name: deepspeed-distributed, Framework: deepspeed, Trainer Type: CustomTrainer\n",
"\n",
"Entrypoint: ['mpirun', '--hostfile', '/etc/mpi/hostfile']\n",
"\n",
"Runtime Accelerators: 4 x gpu-tesla-v100-16gb\n"
"Runtime Accelerators: 4\n"
]
}
],
"source": [
"from kubeflow.trainer import TrainerClient, CustomTrainer\n",
"\n",
"for r in TrainerClient().list_runtimes():\n",
" print(f\"Name: {r.name}, Framework: {r.trainer.framework.value}, Trainer Type: {r.trainer.trainer_type.value}\\n\")\n",
" print(f\"Entrypoint: {r.trainer.entrypoint[:3]}\\n\")\n",
" print(f\"Runtime Accelerators: {r.trainer.accelerator_count} x {r.trainer.accelerator}\")\n",
"\n",
" if r.name == \"deepspeed-distributed\":\n",
" print(f\"Name: {r.name}, Framework: {r.trainer.framework}, Trainer Type: {r.trainer.trainer_type.value}\\n\")\n",
" print(f\"Runtime Accelerators: {r.trainer.accelerator_count}\")\n",
" deepspeed_runtime = r"
]
},
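The updated cell scans `list_runtimes()` and keeps the runtime whose name matches. The same lookup can be written once with `next()`; a minimal sketch with stand-in runtime objects (the real objects come from `TrainerClient().list_runtimes()`, whose shape is assumed here):

```python
from dataclasses import dataclass

@dataclass
class Runtime:
    # Stand-in for the SDK's Runtime; only the field this lookup needs.
    name: str

def pick_runtime(runtimes, name):
    """Return the first runtime whose name matches, or raise if none does."""
    match = next((r for r in runtimes if r.name == name), None)
    if match is None:
        raise ValueError(f"runtime {name!r} not found")
    return match

runtimes = [Runtime("torch-distributed"), Runtime("deepspeed-distributed")]
print(pick_runtime(runtimes, "deepspeed-distributed").name)  # deepspeed-distributed
```

Raising on a missing name fails fast, instead of leaving `deepspeed_runtime` unbound when the cluster lacks the runtime.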
@@ -1066,7 +1064,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
"version": "3.11.13"
}
},
"nbformat": 4,
28 changes: 13 additions & 15 deletions examples/mlx/image-classification/MLX-Distributed-Mnist.ipynb
@@ -67,22 +67,24 @@
]
},
{
"metadata": {},
"cell_type": "markdown",
"id": "3cb9ab4c8e12221a",
"metadata": {},
"source": [
"## Install the Kubeflow SDK\n",
"\n",
"You need to install the Kubeflow SDK to interact with Kubeflow Trainer APIs:"
],
"id": "3cb9ab4c8e12221a"
]
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": "# !pip install git+https://github.com/kubeflow/sdk.git@main#subdirectory=python",
"id": "bd62189280760f42"
"id": "bd62189280760f42",
"metadata": {},
"outputs": [],
"source": [
"# !pip install git+https://github.com/kubeflow/sdk.git@main#subdirectory=python"
]
},
{
"cell_type": "markdown",
@@ -233,7 +235,7 @@
"\n",
"Get available Kubeflow Trainer Runtimes with the `list_runtimes()` API.\n",
"\n",
"You can inspect Runtime details, including the name, framework, entry point, and number of accelerators.\n",
"You can inspect Runtime details, including the name, framework, and number of accelerators.\n",
"\n",
"- Runtimes with **CustomTrainer**: You must write the training script within the function.\n",
"\n",
@@ -242,7 +244,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 4,
"id": "51d8bc1d-8d9b-48f7-866f-c6ad4ad4241b",
"metadata": {},
"outputs": [
@@ -251,8 +253,6 @@
"output_type": "stream",
"text": [
"Name: mlx-distributed, Framework: mlx, Trainer Type: CustomTrainer\n",
"\n",
"Entrypoint: ['mpirun', '--hostfile', '/etc/mpi/hostfile']\n",
"\n"
]
}
@@ -261,10 +261,8 @@
"from kubeflow.trainer import TrainerClient, CustomTrainer\n",
"\n",
"for r in TrainerClient().list_runtimes():\n",
" print(f\"Name: {r.name}, Framework: {r.trainer.framework.value}, Trainer Type: {r.trainer.trainer_type.value}\\n\")\n",
" print(f\"Entrypoint: {r.trainer.entrypoint[:3]}\\n\")\n",
"\n",
" if r.name == \"mlx-distributed\":\n",
" print(f\"Name: {r.name}, Framework: {r.trainer.framework}, Trainer Type: {r.trainer.trainer_type.value}\\n\")\n",
" mlx_runtime = r"
]
},
@@ -725,7 +723,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.2"
"version": "3.11.13"
}
},
"nbformat": 4,
36 changes: 11 additions & 25 deletions examples/pytorch/image-classification/mnist.ipynb
@@ -355,7 +355,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 2,
"metadata": {
"pycharm": {
"name": "#%%\n"
@@ -381,14 +381,18 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Runtime(name='torch-distributed', trainer=Trainer(trainer_type=<TrainerType.CUSTOM_TRAINER: 'CustomTrainer'>, framework=<Framework.TORCH: 'torch'>, entrypoint='torchrun', accelerator='gpu-tesla-v100-16gb', accelerator_count='4.0'), pretrained_model=None)\n"
"Runtime(name='deepspeed-distributed', trainer=RuntimeTrainer(trainer_type=<TrainerType.CUSTOM_TRAINER: 'CustomTrainer'>, framework='deepspeed', num_nodes=1, accelerator_count=4), pretrained_model=None)\n",
"Runtime(name='mlx-distributed', trainer=RuntimeTrainer(trainer_type=<TrainerType.CUSTOM_TRAINER: 'CustomTrainer'>, framework='mlx', num_nodes=1, accelerator_count=1), pretrained_model=None)\n",
"Runtime(name='torch-distributed', trainer=RuntimeTrainer(trainer_type=<TrainerType.CUSTOM_TRAINER: 'CustomTrainer'>, framework='torch', num_nodes=1, accelerator_count='Unknown'), pretrained_model=None)\n",
"Runtime(name='torchtune-llama3.2-1b', trainer=RuntimeTrainer(trainer_type=<TrainerType.BUILTIN_TRAINER: 'BuiltinTrainer'>, framework='torchtune', num_nodes=1, accelerator_count='2.0'), pretrained_model=None)\n",
"Runtime(name='torchtune-llama3.2-3b', trainer=RuntimeTrainer(trainer_type=<TrainerType.BUILTIN_TRAINER: 'BuiltinTrainer'>, framework='torchtune', num_nodes=1, accelerator_count='2.0'), pretrained_model=None)\n"
]
}
],
@@ -444,32 +448,14 @@
"You can get the individual status for each of these steps."
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"import time\n",
"\n",
"def wait_for_job_running():\n",
" for _ in range(100):\n",
" trainjob = client.get_job(name=job_name)\n",
" for c in trainjob.steps:\n",
" if c.name == \"trainer-node-0\" and c.status == \"Running\":\n",
" return\n",
" print(\"Wait for TrainJob running status. Sleep for 5 seconds\")\n",
" time.sleep(5)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# TODO (andreyvelich): Use wait_for_job_status API from TrainerClient() when it is implemented.\n",
"wait_for_job_running()"
"# Wait for the running status.\n",
"client.wait_for_job_status(name=job_name, status={\"Running\"})"
]
},
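The new `wait_for_job_status` call replaces the hand-rolled polling loop that this hunk removes. Its core behavior — poll until the job reaches a target status or give up — can be sketched in plain Python; the status callback here is a stand-in, not the SDK's client:

```python
import time

def wait_for_status(get_status, target="Running", timeout=30.0, interval=1.0):
    """Poll get_status() until it returns `target` or the timeout elapses."""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if get_status() == target:
            return True
        time.sleep(interval)
    return False

# A fake job that reaches Running on the third poll.
states = iter(["Created", "Pending", "Running"])
print(wait_for_status(lambda: next(states), interval=0.0))  # True
```

Unlike the removed loop's fixed 100 iterations, a deadline-based bound keeps the wait time independent of the polling interval.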
{
@@ -609,7 +595,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "mlx",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@@ -623,7 +609,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.2"
"version": "3.11.13"
}
},
"nbformat": 4,