Commit 3c59038

Add a multi-slice E2E test and update torch_xla pin to 20250313 (#146)
* Add a multi-slice E2E test and update torch_xla to 20250313. The 20250313 docker image contains the fixes for multi-slice training, and we add an E2E test to make sure this doesn't regress.
* Add more help.
1 parent f15f22a · commit 3c59038

5 files changed (+52, −5 lines)


.github/workflows/cpu_test.yml

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@ jobs:
       matrix:
         python-version: ["3.10", "3.11"]
     container:
-      image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_${{ matrix.python-version }}_tpuvm_cxx11_20250312
+      image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_${{ matrix.python-version }}_tpuvm_cxx11_20250313
     steps:
       - uses: actions/checkout@v4
       - name: Install torchax
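
For context, the image tag is parameterized by the job's Python version matrix, so only the trailing date changes here. A rough sketch of how such a tag string is assembled (the helper below is illustrative only, not part of the repository):

def nightly_xla_image(python_version: str, date: str) -> str:
  # Mirrors the tag format used in the workflow above.
  return (
    "us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:"
    f"nightly_{python_version}_tpuvm_cxx11_{date}"
  )

assert nightly_xla_image("3.11", "20250313").endswith("_tpuvm_cxx11_20250313")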

.github/workflows/e2e_test.yml

Lines changed: 32 additions & 0 deletions
@@ -17,6 +17,7 @@ jobs:
     outputs:
       llama-3-8b-name: ${{ steps.run-llama-3-8b.outputs.name }}
       llama-3-8b-2d-name: ${{ steps.run-llama-3-8b-2d.outputs.name }}
+      llama-3-8b-2-slice-name: ${{ steps.run-llama-3-8b-2-slice.outputs.name }}
       mixtral-8x7b-name: ${{ steps.run-mixtral-8x7b.outputs.name }}
       artifact-dir: ${{ steps.artifacts.outputs.artifact_dir }}
     steps:
@@ -102,6 +103,28 @@ jobs:
             profile_step=3 \
             max_steps=15
 
+      - name: Run Llama 3.0 8B (2 slice)
+        id: run-llama-3-8b-2-slice
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+          XLA_IR_DEBUG: 1
+          XLA_HLO_DEBUG: 1
+        run: |
+          name=$(e2e_testing/gen_name.py llama-3-8b-2-slice)
+          echo "name=$name" >> "$GITHUB_OUTPUT"
+          tp run \
+            --name $name \
+            --num-slices 2 \
+            torchprime/torch_xla_models/train.py \
+            model=llama-3-8b \
+            model/scaling=llama-fsdp \
+            global_batch_size=16 \
+            dcn_mesh.fsdp=2 \
+            ici_mesh.fsdp=4 \
+            dataset_config_name=wikitext-2-raw-v1 \
+            profile_step=3 \
+            max_steps=15
+
   llama-3-8b:
     name: Llama 3.0 8B
     needs: tp-run
@@ -120,6 +143,15 @@ jobs:
       artifact_dir: ${{ needs.tp-run.outputs.artifact-dir }}
     secrets: inherit
 
+  llama-3-8b-2-slice:
+    name: Llama 3.0 8B (2 slice)
+    needs: tp-run
+    uses: ./.github/workflows/reusable_e2e_check.yml
+    with:
+      jobset_name: ${{ needs.tp-run.outputs.llama-3-8b-2-slice-name }}
+      artifact_dir: ${{ needs.tp-run.outputs.artifact-dir }}
+    secrets: inherit
+
   mixtral-8x7b:
     name: Mixtral 8x7B
     needs: tp-run
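
The new 2-slice step exercises FSDP across slices: dcn_mesh.fsdp=2 spans the two slices over the data-center network, while ici_mesh.fsdp=4 shards within each slice. A quick sanity check of that mesh arithmetic, as an illustrative sketch (the helper is hypothetical, not torchprime code):

import math

def mesh_device_count(dcn_mesh: dict[str, int], ici_mesh: dict[str, int]) -> int:
  # Total devices implied by the inter-slice (DCN) and intra-slice (ICI) meshes.
  return math.prod(dcn_mesh.values()) * math.prod(ici_mesh.values())

# Values from the workflow step above: 2-way FSDP across slices, 4-way FSDP within each.
assert mesh_device_count({"fsdp": 2}, {"fsdp": 4}) == 8  # e.g. two 4-device slices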

README.md

Lines changed: 1 addition & 0 deletions
@@ -99,6 +99,7 @@ tp run torchprime/experimental/torchax_models/run.py global_batch_size=256
 
 `tp run` will broadcast the specified command to all VMs in the XPK cluster,
 which is the convention for running SPMD distributed workloads.
+See `tp run --help` for more advanced features.
 
 #### Env vars passed to the workload

torchprime/launcher/Dockerfile

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 # syntax=docker/dockerfile:experimental
 # Use torch_xla Python 3.10 as the base image
-FROM us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_cxx11_20250312
+FROM us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_cxx11_20250313
 
 ARG USE_TRANSFORMERS=false
 ARG USE_LOCAL_WHEEL=false

torchprime/launcher/cli.py

Lines changed: 17 additions & 3 deletions
@@ -70,8 +70,9 @@ def cli(ctx, interactive):
 @click.option(
   "--num-slices",
   required=False,
+  type=int,
   default=1,
-  help="Number of TPU slice to use. Defaults to 1",
+  help="Number of TPU slice to use by default. Defaults to 1",
 )
 @click.option(
   "--tpu-type",
@@ -207,14 +208,24 @@ def create_and_activate_gcloud(gcloud_config_name, config: Config):
   "defaults to one based on the date and time.",
   default=None,
 )
+@click.option(
+  "--num-slices",
+  required=False,
+  type=int,
+  default=None,
+  help="Temporarily override the number of TPU slice to use for this run. "
+  "If unspecified, `tp run` will use the slice count configured in `tp use`.",
+)
 @click.option("--use-hf", is_flag=True, help="Use HuggingFace transformer")
 @click.option(
   "--use-local-wheel",
   is_flag=True,
   help="Use local torch and torch_xla wheels under folder local_dist/",
 )
 @interactive
-def run(args, name: str | None, use_hf: bool, use_local_wheel: bool):
+def run(
+  args, name: str | None, num_slices: int | None, use_hf: bool, use_local_wheel: bool
+):
   """
   Runs the provided SPMD training command as an xpk job on a GKE cluster.
   """
@@ -258,6 +269,9 @@ def run(args, name: str | None, use_hf: bool, use_local_wheel: bool):
       f"TORCHPRIME_JOBSET_NAME={workload_name}",
   ]
 
+  if num_slices is None:
+    num_slices = config.num_slices
+
   ensure_command("xpk")
   xpk_command = (
     [
@@ -273,7 +287,7 @@ def run(args, name: str | None, use_hf: bool, use_local_wheel: bool):
       "--tpu-type",
       config.tpu_type,
       "--num-slices",
-      str(config.num_slices),
+      str(num_slices),
       "--zone",
       config.zone,
       "--project",
