AI-Hypercomputer
diff --git a/.github/workflows/e2e_test.yml b/.github/workflows/e2e_test.yml
Lines changed: 33 additions & 2 deletions
diff --git a/README.md b/README.md
Lines changed: 15 additions & 3 deletions
diff --git a/torchprime/experimental/torchax_models/custom_mesh.py b/torchprime/experimental/torchax_models/custom_mesh.py
Lines changed: 0 additions & 75 deletions
diff --git a/torchprime/experimental/torchax_models/run.py b/torchprime/experimental/torchax_models/run.py
Lines changed: 3 additions & 2 deletions
diff --git a/torchprime/launcher/thunk.py b/torchprime/launcher/thunk.py
Lines changed: 2 additions & 1 deletion
diff --git a/torchprime/mesh/__init__.py b/torchprime/mesh/__init__.py
diff --git a/torchprime/mesh/custom_mesh.py b/torchprime/mesh/custom_mesh.py
Lines changed: 152 additions & 0 deletions
@@ -16,6 +16,7 @@ jobs:
       ARTIFACT_DIR: gs://torchprime-e2e-tests/${{ github.job }}/${{ github.run_id }}-${{ github.run_attempt }}
     outputs:
       llama-3-8b-name: ${{ steps.run-llama-3-8b.outputs.name }}
+      llama-3-8b-2d-name: ${{ steps.run-llama-3-8b-2d.outputs.name }}
       mixtral-8x7b-name: ${{ steps.run-mixtral-8x7b.outputs.name }}
       artifact-dir: ${{ steps.artifacts.outputs.artifact_dir }}
     steps:
@@ -55,7 +56,28 @@ jobs:
             torchprime/torch_xla_models/train.py \
             model=llama-3-8b \
             global_batch_size=8 \
-            mesh.fsdp=4 \
+            ici_mesh.fsdp=4 \
+            dataset_config_name=wikitext-2-raw-v1 \
+            profile_step=3 \
+            max_steps=15
+
+      - name: Run Llama 3.0 8B (2D sharding)
+        id: run-llama-3-8b-2d
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+          XLA_IR_DEBUG: 1
+          XLA_HLO_DEBUG: 1
+        run: |
+          name=$(e2e_testing/gen_name.py llama-3-8b-2d)
+          echo "name=$name" >> "$GITHUB_OUTPUT"
+          tp run \
+            --name $name \
+            torchprime/torch_xla_models/train.py \
+            model=llama-3-8b \
+            model/scaling=llama-fsdp-tp \
+            global_batch_size=8 \
+            ici_mesh.fsdp=2 \
+            ici_mesh.tensor=2 \
             dataset_config_name=wikitext-2-raw-v1 \
             profile_step=3 \
             max_steps=15
@@ -75,7 +97,7 @@ jobs:
             model=mixtral-8x7b \
             model.num_hidden_layers=16 \
             global_batch_size=8 \
-            mesh.fsdp=4 \
+            ici_mesh.fsdp=4 \
             dataset_config_name=wikitext-2-raw-v1 \
             profile_step=3 \
             max_steps=15
@@ -89,6 +111,15 @@ jobs:
       artifact_dir: ${{ needs.tp-run.outputs.artifact-dir }}
     secrets: inherit
 
+  llama-3-8b-2d:
+    name: Llama 3.0 8B (2D sharding)
+    needs: tp-run
+    uses: ./.github/workflows/reusable_e2e_check.yml
+    with:
+      jobset_name: ${{ needs.tp-run.outputs.llama-3-8b-2d-name }}
+      artifact_dir: ${{ needs.tp-run.outputs.artifact-dir }}
+    secrets: inherit
+
   mixtral-8x7b:
     name: Mixtral 8x7B
     needs: tp-run
 
@@ -48,13 +48,13 @@ In both `torch_xla_models` and `torchax_models` directories, you'll find
 a `configs/default.yaml`. That specifies the default configuration for the
 trainer. You may override configs on the command line with a `key=value`
 syntax. For example, the following command will train Mixtral 8x7B with a
-global batch size of 256, and set the FSDP SPMD mesh axis length to 64:
+global batch size of 256, and set the FSDP SPMD ICI mesh axis length to 64:
 
 ```sh
 python3 torchprime/torch_xla_models/train.py \
     model=mixtral-8x7b \
     global_batch_size=256 \
-    mesh.fsdp=64
+    ici_mesh.fsdp=64
 ```
 
 You may refer to the hydra docs for other ways to specify configs.
@@ -81,11 +81,23 @@ tp use \
 Then prepend `tp run` to a particular Python file you would like to
 run remotely, including arguments, e.g.
 
+`torch_xla` example:
+
+```sh
+# Train Llama 3.0 8B on 256 chips
+tp run torchprime/torch_xla_models/train.py \
+    model=llama-3-8b \
+    global_batch_size=256 \
+    ici_mesh.fsdp=256
+```
+
+`torchax` example:
+
 ```sh
 tp run torchprime/experimental/torchax_models/run.py global_batch_size=256
 ```
 
-`tp run` will broadcast this command to all VMs in the XPK cluster,
+`tp run` will broadcast the specified command to all VMs in the XPK cluster,
 which is the convention for running SPMD distributed workloads.
 
 #### Env var passed to the workload
 
@@ -1,7 +1,6 @@
 import functools
 import math
 
-import custom_mesh
 import hydra
 import jax
 import numpy as np
@@ -19,6 +18,8 @@
 from omegaconf import DictConfig, OmegaConf
 from torchax import interop
 
+from torchprime.mesh import custom_mesh
+
 sharding_map_original = {
   "freqs_cis": (),  #  torch.complex64 (2048, 64)
   "tok_embeddings.weight": (
@@ -208,7 +209,7 @@ def main(config: DictConfig):
     tp = 4
     if len(jax.devices()) == 512:
       dev_array = custom_mesh.create_custom_64x4_device_mesh(
-        (64, 4), (2, 1), jax.devices()
+        (64, tp), (2, 1), jax.devices()
       )
     else:
       assert len(jax.devices()) == 256
 
@@ -43,7 +43,8 @@
   [
     os.getenv("XLA_FLAGS", ""),
     f"--xla_dump_to={xla_dump_path}/",
-    "--xla_dump_hlo_as_proto",
+    "--xla_dump_hlo_as_proto",  # Save HLO protobuf files
+    "--xla_dump_hlo_as_text",  # Save HLO text files
   ]
 )
 print(f"Dumping XLA compiler outputs to {xla_dump_path}", flush=True)
 
@@ -0,0 +1,152 @@
+"""
+`custom_mesh` implements virtual device meshes with better performance than
+the default device mesh generated by torch_xla or torchax.
+"""
+
+import collections
+import dataclasses
+from collections.abc import Sequence
+from typing import Any
+
+import numpy as np
+from torch.utils._pytree import tree_map
+
+
+def maybe_get_custom_mesh(
+  ici_mesh_shape: Sequence[int],
+  dcn_mesh_shape: Sequence[int],
+  num_devices: int,
+  num_slices: int,
+) -> np.ndarray | None:
+  """
+  Get a more performant custom mesh given the mesh shape if applicable.
+
+  The dimensions in mesh shapes should be ordered from least communication intensive
+  to most communication intensive.
+
+  Args:
+    ici_mesh_shape: Mesh shape inside a granule. Trailing axes of size 1 are
+      ignored when looking for a supported pattern.
+    dcn_mesh_shape: Forwarded unchanged to `get_64x4_hybrid_ring_mesh`.
+    num_devices: Forwarded unchanged to `get_64x4_hybrid_ring_mesh`.
+    num_slices: Forwarded unchanged to `get_64x4_hybrid_ring_mesh`.
+
+  Returns:
+    The result of `get_64x4_hybrid_ring_mesh` when the two innermost
+    non-trivial axes of `ici_mesh_shape` are 64 and 4 (in that order),
+    otherwise `None`.
+  """
+  # Work on a copy and drop trailing size-1 axes, so the pattern match below
+  # looks at the innermost axes whose size is not 1.
+  non_trivial_ici_mesh_shape = list(ici_mesh_shape)
+  while non_trivial_ici_mesh_shape:
+    if non_trivial_ici_mesh_shape[-1] == 1:
+      non_trivial_ici_mesh_shape.pop()
+    else:
+      break
+
+  # Pattern matching for 64x4 custom mesh inside a granule.
+  # When there exists a 4 chip group that is more communication intensive
+  # (e.g. tensor parallelism), we should reshape those groups of 4 devices
+  # into a ring to improve collectives performance.
+  # NOTE(review): only the last two remaining axes are checked; any leading
+  # axes are passed through as-is - confirm they are expected to be size 1.
+  if (
+    len(non_trivial_ici_mesh_shape) >= 2
+    and non_trivial_ici_mesh_shape[-1] == 4
+    and non_trivial_ici_mesh_shape[-2] == 64
+  ):
+    # NOTE(review): the stripped shape (without trailing size-1 axes) is what
+    # gets forwarded, not the caller's original `ici_mesh_shape` - confirm
+    # that callers do not rely on the original rank.
+    return get_64x4_hybrid_ring_mesh(
+      ici_mesh_shape=non_trivial_ici_mesh_shape,
+      dcn_mesh_shape=dcn_mesh_shape,
+      num_devices=num_devices,
+      num_slices=num_slices,
+    )
+  # No supported pattern matched; the caller gets no custom mesh.
+  return None
+
+
+def create_custom_64x4_device_mesh(
+  mesh_shape: Sequence[int],
+  dcn_mesh_shape: Sequence[int],
+  devices: Sequence[Any],
+) -> np.ndarray:
+  """
+  Custom device mesh for 64x4 ICI parallelism.
+
+  Arranges every group of 4 devices into a ring, to improve collectives performance for those groups
+  of 4 devices.
+
+  This function is a simplified variation of [1].
+
+  [1]: https://github.com/jax-ml/jax/blame/1079dc4477d41fd25397c8d0b78a32bdc5fa48da/jax/_src/mesh_utils.py#L790
+
+  Args:
+    mesh_shape: Shape each per-granule (64, 4) ring array is reshaped to, so
+      it must describe 256 elements.
+    dcn_mesh_shape: Layout of the granules; its product must equal the number
+      of distinct `slice_index` values found in `devices`.
+    devices: Device objects exposing a `slice_index` attribute, which is used
+      to group them into granules.
+
+  Returns:
+    An array of devices assembled with `np.block` from the per-granule
+    meshes, laid out according to `dcn_mesh_shape`.
+
+  Raises:
+    AssertionError: If `len(devices)` is not a multiple of 256.
+    ValueError: If the first device has no `slice_index` attribute, or if the
+      product of `dcn_mesh_shape` differs from the number of granules.
+  """
+
+  # Imported locally rather than at module scope - presumably so that
+  # importing this module does not require jax; TODO confirm.
+  from jax.experimental import mesh_utils
+
+  # NOTE(review): `assert` is skipped when Python runs with -O, so this input
+  # check can silently disappear. It also only checks the total count; each
+  # granule is later laid out as 16x16, so every granule presumably needs
+  # exactly 256 devices - confirm.
+  assert (
+    len(devices) % 256 == 0
+  ), f"This custom mesh is not valid for {len(devices)} devices"
+  attr = "slice_index"
+  # Only the first device is probed for the attribute; the loop below reads
+  # it from every device.
+  # NOTE(review): the message mentions a `process_is_granule` option that this
+  # function does not take - it looks carried over from the JAX original [1].
+  if not hasattr(devices[0], attr):
+    raise ValueError(
+      f"Device {devices[0]} does not have attribute {attr}. See"
+      " `process_is_granule` option."
+    )
+  # Group devices by slice index; granules are then ordered by ascending
+  # slice index.
+  granule_dict = collections.defaultdict(list)
+  for dev in devices:
+    granule_dict[getattr(dev, attr)].append(dev)
+  granules = [granule_dict[key] for key in sorted(granule_dict.keys())]
+  if np.prod(dcn_mesh_shape) != len(granules):
+    raise ValueError(
+      f"Number of slices {len(granules)} must equal the product of "
+      f"dcn_mesh_shape {dcn_mesh_shape}"
+    )
+  # Ask JAX for a 16x16 arrangement of each granule's devices.
+  per_granule_meshes = [
+    mesh_utils.create_device_mesh(
+      [16, 16],
+      granule,
+      allow_split_physical_axes=False,
+    )
+    for granule in granules
+  ]
+
+  def reshape_mesh_to_rings(a):
+    # Walk the 16x16 grid `a` as an 8x8 grid of 2x2 tiles and turn each tile
+    # into one row of 4 devices, giving a (64, 4) array.
+    b = []
+    for i in range(8):
+      b.append([])
+      for j in range(8):
+        a_i = i * 2
+        a_j = j * 2
+        # forms a ring of size 4: the tile is listed in cyclic order
+        # (top-left, top-right, bottom-right, bottom-left), so consecutive
+        # entries (and last -> first) are adjacent cells of the 2x2 tile.
+        b[i].append(
+          [
+            a[a_i, a_j],
+            a[a_i, a_j + 1],
+            a[a_i + 1, a_j + 1],
+            a[a_i + 1, a_j],
+          ]
+        )
+    b = np.array(b)
+    b = np.reshape(b, (64, 4))
+    return b
+
+  # Re-order every granule into rings, then give it the requested shape.
+  per_granule_meshes = [
+    np.reshape(reshape_mesh_to_rings(x), mesh_shape) for x in per_granule_meshes
+  ]
+  # Lay granule indices out in the DCN shape, swap each index for its
+  # per-granule mesh (object dtype keeps each mesh as a single element), and
+  # let np.block stitch the nested blocks into one device array.
+  granule_mesh = np.arange(len(granules)).reshape(dcn_mesh_shape)
+  blocks = np.vectorize(lambda i: per_granule_meshes[i], otypes=[object])(granule_mesh)
+  device_mesh = np.block(blocks.tolist())
+  return device_mesh
+
+
+@dataclasses.dataclass
+class Device:
+  """
+  Lightweight stand-in for a device object.
+
+  `get_64x4_hybrid_ring_mesh` builds instances of this class and feeds them to
+  `create_custom_64x4_device_mesh`, so a mesh of device ids can be computed
+  from counts alone.
+  """
+
+  # Set by `get_64x4_hybrid_ring_mesh` to the same value as `slice_index`.
+  process_index: int
+  # Read by `create_custom_64x4_device_mesh` to group devices into granules.
+  slice_index: int
+  # Identifier that ends up in the array returned by
+  # `get_64x4_hybrid_ring_mesh`.
+  uid: int
+  # Not read in this module; presumably consulted by
+  # `jax.experimental.mesh_utils.create_device_mesh` - TODO confirm.
+  device_kind: str = ""
+  # Not read in this module; presumably consulted by
+  # `jax.experimental.mesh_utils.create_device_mesh` - TODO confirm.
+  platform: str = "cpu"
+
+
+def get_64x4_hybrid_ring_mesh(
+  ici_mesh_shape: Sequence[int],
+  dcn_mesh_shape: Sequence[int],
+  num_devices: int,
+  num_slices: int,
+) -> np.ndarray:
+  """
+  Compute the 64x4 ring mesh as a flat array of device ids.
+
+  Builds `num_devices` placeholder `Device` objects, arranges them with
+  `create_custom_64x4_device_mesh`, and returns their `uid`s in the resulting
+  order.
+
+  Args:
+    ici_mesh_shape: Passed to `create_custom_64x4_device_mesh` as `mesh_shape`.
+    dcn_mesh_shape: Passed to `create_custom_64x4_device_mesh` unchanged.
+    num_devices: Total number of placeholder devices to create; uids are
+      `0 .. num_devices - 1`.
+    num_slices: Number of slices the devices are divided into.
+
+  Returns:
+    A 1-D array of device uids in mesh order.
+  """
+  # NOTE(review): raises ZeroDivisionError when `num_slices` is 0, and any
+  # remainder of `num_devices / num_slices` spills into an extra slice index.
+  num_devices_per_granule = num_devices // num_slices
+  # Consecutive uids share a slice; process_index and slice_index are both
+  # set to the uid's granule number.
+  devices = [
+    Device(i // num_devices_per_granule, i // num_devices_per_granule, i)
+    for i in range(num_devices)
+  ]
+  # Arrange the placeholders into the custom mesh, then flatten to a list.
+  devices = (
+    create_custom_64x4_device_mesh(ici_mesh_shape, dcn_mesh_shape, devices)
+    .reshape(-1)
+    .tolist()
+  )
+  # Replace every placeholder with its uid.
+  devices = np.array(tree_map(lambda d: d.uid, devices))
+  return devices
Original file line number	Diff line number	Diff line change
`@@ -43,7 +43,8 @@`
`43`	`43`	`[`
`44`	`44`	`os.getenv("XLA_FLAGS", ""),`
`45`	`45`	`f"--xla_dump_to={xla_dump_path}/",`
`46`		`- "--xla_dump_hlo_as_proto",`
	`46`	`+ "--xla_dump_hlo_as_proto", # Save HLO protobuf files`
	`47`	`+ "--xla_dump_hlo_as_text", # Save HLO text files`
`47`	`48`	`]`
`48`	`49`	`)`
`49`	`50`	`print(f"Dumping XLA compiler outputs to {xla_dump_path}", flush=True)`