
Commit 6d88c08

Use shard_as in scan to ensure that inputs and their gradients have the same sharding (#8879)
1 parent fe3bb7f commit 6d88c08
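
The gist of the change, as a hedged sketch (the scan backward-pass edit itself is not reproduced on this page, and the variable names below are illustrative): instead of hard-coding a partition spec for the gradient of a scanned input, tie the gradient's sharding to the input's sharding with the new shard_as helper, so GSPMD propagates a matching sharding rather than a partially replicated one.

    import torch_xla.distributed.spmd as xs

    # Hypothetical backward step inside scan: whatever sharding `xs_input`
    # carries, its gradient is constrained to carry the same one (and vice
    # versa), without naming a concrete mesh or partition spec here.
    xs_input, grad_xs = xs.shard_as(xs_input, grad_xs)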

File tree

12 files changed: +281 −59 lines

.circleci/common.sh

Lines changed: 1 addition & 0 deletions
@@ -116,6 +116,7 @@ function install_post_deps_pytorch_xla() {
     -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html \
     -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
 
+  # TODO(https://github.com/pytorch/xla/issues/8831): Remove this when torchax is part of torch_xla.
   pip install xla/torchax
 }
 

.github/workflows/_test.yml

Lines changed: 1 addition & 0 deletions
@@ -133,6 +133,7 @@ jobs:
            -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
 
          # Install torchax
+         # TODO(https://github.com/pytorch/xla/issues/8831): Remove this when torchax is part of torch_xla.
          pip install pytorch/xla/torchax
 
          if [[ ! -z "$RUN_BENCHMARK_TESTS" ]]; then

.github/workflows/_tpu_ci.yml

Lines changed: 4 additions & 0 deletions
@@ -27,6 +27,10 @@ jobs:
          pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
          pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-wheels/index.html -f https://storage.googleapis.com/libtpu-releases/index.html
          pip install --upgrade protobuf
+
+         # torchax is needed for call_jax tests.
+         # TODO(https://github.com/pytorch/xla/issues/8831): Remove this when torchax is part of torch_xla.
+         pip install pytorch/xla/torchax
      - name: Run Tests
        env:
          PJRT_DEVICE: TPU

test/scan/test_scan.py

Lines changed: 3 additions & 1 deletion
@@ -475,7 +475,9 @@ def unpack(x):
 
     # Find the input that is stored in the context object.
     stored_xs = None
-    for s in storage:
+    # Dedupe the tensors because the autograd context may save the same tensor twice.
+    # Saving a tensor twice won't use extra storage though thanks to ref-counting.
+    for s in set(storage):
       if s.shape == xs.shape:
         assert stored_xs is None
         stored_xs = s
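
For context on the set(storage) change above, a small hedged sketch (names are illustrative): torch.Tensor hashes by object identity, so set() collapses repeated references to the same saved tensor without comparing values, and the duplicates cost no extra memory because they share one underlying storage.

    import torch

    t = torch.randn(4)
    storage = [t, t, torch.randn(4)]  # the same tensor saved twice, plus another
    unique = set(storage)             # dedupes by identity, not by value
    assert len(unique) == 2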

test/scan/test_scan_spmd.py

Lines changed: 95 additions & 3 deletions
@@ -1,15 +1,15 @@
-from copy import deepcopy
 import sys
 import re
 import unittest
 
+import numpy as np
 import torch
 import torch_xla
 import torch.nn as nn
-from torch_xla.distributed.spmd.xla_sharding import apply_xla_patch_to_nn_linear
+from torch_xla.distributed.spmd.xla_sharding import apply_xla_patch_to_nn_linear, Mesh
 from torch_xla.experimental.scan import scan
 from torch_xla.experimental.scan_layers import scan_layers
-from torch_xla.distributed.spmd import mark_sharding, set_global_mesh, get_1d_mesh
+from torch_xla.distributed.spmd import mark_sharding, mark_sharding_with_gradients, set_global_mesh, get_1d_mesh, get_global_mesh
 import torch_xla.runtime as xr
 
 
@@ -59,6 +59,98 @@ def fn(carry, x):
         f'devices=[1,{N}]0,',
         torch_xla._XLAC._get_xla_tensor_debug_info(tensor))
 
+  @unittest.skipUnless(xr.global_runtime_device_count() >= 4,
+                       "Multiple devices required")
+  def test_scan_2d_sharding(self):
+    """
+    Test the sharding propagation of gradients when scanning 2D sharded layers.
+
+    Specifically, we scan over a group of simple MLP blocks found in transformers.
+
+    Inputs:
+      A: [B_x, S, H_y]
+      W1: [F_y, H_x]
+      W2: [H_x, F_y]
+
+    Outputs:
+      B: [B_x, S, H_y]
+
+    B = A @ W1.T @ W2.T
+
+    Under 2D sharding, the gradient of loss w.r.t. (A @ W1.T) is 2D sharded.
+    A is also 2D sharded. GSPMD needs to figure out that the gradient of loss w.r.t.
+    W1 should also be 2D sharded.
+    """
+
+    mesh_shape = (2, xr.global_runtime_device_count() // 2)
+    mesh_axis_names = ("fsdp", "tensor")
+    mesh = Mesh(
+        np.arange(xr.global_runtime_device_count()), mesh_shape,
+        mesh_axis_names)
+
+    class MLPBlock(nn.Module):
+
+      def __init__(self):
+        super().__init__()
+        self.up_proj = nn.Linear(128, 256, bias=False)
+        self.down_proj = nn.Linear(256, 128, bias=False)
+
+      def forward(self, hidden_states):
+        hidden_states = mark_sharding_with_gradients(hidden_states, mesh,
+                                                     ("fsdp", None, "tensor"))
+        hidden_states = self.up_proj(hidden_states)
+        hidden_states = mark_sharding_with_gradients(hidden_states, mesh,
+                                                     ("fsdp", None, "tensor"))
+        hidden_states = torch.sin(hidden_states)
+        hidden_states = mark_sharding_with_gradients(hidden_states, mesh,
+                                                     ("fsdp", None, "tensor"))
+        hidden_states = self.down_proj(hidden_states)
+        hidden_states = mark_sharding_with_gradients(hidden_states, mesh,
+                                                     ("fsdp", None, "tensor"))
+        return hidden_states
+
+    class MyModel(nn.Module):
+
+      def __init__(self):
+        super().__init__()
+        self.layers = nn.Sequential(*[MLPBlock() for _ in range(4)])
+
+      def forward(self, hidden_states: torch.Tensor):
+        hidden_states = mark_sharding_with_gradients(hidden_states, mesh,
+                                                     ("fsdp", None, "tensor"))
+        return scan_layers(self.layers, hidden_states)
+
+    torch.manual_seed(42)
+    torch_xla.manual_seed(42)
+    model = MyModel().to('xla')
+    model = apply_xla_patch_to_nn_linear(model)
+    for name, param in model.named_parameters():
+      if 'up_proj' in name:
+        mark_sharding(param, mesh, ("tensor", "fsdp"))
+      if 'down_proj' in name:
+        mark_sharding(param, mesh, ("fsdp", "tensor"))
+
+    # Batch, Seq, Hidden
+    hidden_states = torch.randn((3, 50, 128), device='xla')
+    torch_xla.sync()
+
+    # Run the model
+    model.zero_grad()
+    out = model(hidden_states)
+    # Prepare to check the gradient of W1
+    for layer in model.layers.children():  # type: ignore
+      layer.up_proj.weight.retain_grad()  # type: ignore
+    out.sum().backward()
+    torch_xla.sync(wait=True)
+    # Check the gradient of W1
+    for layer in model.layers.children():  # type: ignore
+      # Right: {devices=[2,2]0,2,1,3}, {devices=[4,2]0,4,1,5,2,6,3,7} or similar
+      # Wrong: {devices=[2,1,2]0,2,1,3 last_tile_dim_replicate} or similar
+      sharding_spec = torch_xla._XLAC._get_xla_sharding_spec(
+          layer.up_proj.weight.grad)  # type: ignore
+      self.assertIn(f'devices=[{mesh_shape[1]},2]0', sharding_spec)
+      self.assertNotIn('last_tile_dim_replicate', sharding_spec)
+
   @unittest.skipUnless(xr.global_runtime_device_count() >= 4,
                        "Multiple devices required")
   def test_scan_xla_patched_linear(self):

test/spmd/test_xla_sharding.py

Lines changed: 23 additions & 0 deletions
@@ -1660,6 +1660,29 @@ def test_get_logical_mesh(self):
     self.assertEqual(logical_mesh.shape, mesh_shape)
     np.testing.assert_array_equal(np.sort(logical_mesh.flatten()), device_ids)
 
+  @unittest.skipIf(xr.device_type() == 'CPU',
+                   "sharding will be the same for both tensors on single device"
+                  )
+  def test_shard_as(self):
+    mesh = self._get_mesh((self.n_devices,))
+    partition_spec = (0,)
+    x = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8],
+                     dtype=torch.float,
+                     device=xm.xla_device())
+    x = xs.mark_sharding_with_gradients(x, mesh, partition_spec)
+    y = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8],
+                     dtype=torch.float,
+                     device=xm.xla_device())
+
+    x, y = xs.shard_as(x, y)
+    torch_xla.sync()
+
+    sharding_spec = '{devices=[%d]' % self.n_devices
+    x_sharding = torch_xla._XLAC._get_xla_sharding_spec(x)
+    y_sharding = torch_xla._XLAC._get_xla_sharding_spec(y)
+    self.assertIn(sharding_spec, x_sharding)
+    self.assertEqual(x_sharding, y_sharding)
+
 
 if __name__ == '__main__':
   test = unittest.main()
torch_xla/_internal/jax_workarounds.py

Lines changed: 44 additions & 0 deletions

@@ -0,0 +1,44 @@
+import os
+from contextlib import contextmanager
+from typing import Callable, Any
+import functools
+
+
+# TODO(https://github.com/pytorch/xla/issues/8793): Get rid of this hack.
+def jax_import_guard():
+  import torch_xla
+  # Somehow, we need to grab the TPU before JAX locks it. Otherwise, any pt-xla TPU operations will hang.
+  torch_xla._XLAC._init_computation_client()
+
+
+# TODO(https://github.com/pytorch/xla/issues/8793): Get rid of this hack.
+def requires_jax(func: Callable) -> Callable:
+  """Decorator that ensures JAX is safely imported before function execution"""
+
+  @functools.wraps(func)
+  def wrapper(*args, **kwargs) -> Any:
+    try:
+      jax_import_guard()
+    except ImportError as e:
+      raise ImportError(
+          "JAX import guard fail due to PJRT client is unavailable.") from e
+    with jax_env_context():
+      return func(*args, **kwargs)
+
+  return wrapper
+
+
+# TODO(b/374631442): Get rid of this hack that works around MegaScale hanging.
+@contextmanager
+def jax_env_context():
+  previous_skip_megascale_env = None
+  try:
+    previous_skip_megascale_env = os.environ.get('SKIP_MEGASCALE_PJRT_CLIENT',
+                                                 None)
+    os.environ['SKIP_MEGASCALE_PJRT_CLIENT'] = 'true'
+    yield
+  finally:
+    if previous_skip_megascale_env:
+      os.environ['SKIP_MEGASCALE_PJRT_CLIENT'] = previous_skip_megascale_env
+    else:
+      os.environ.pop('SKIP_MEGASCALE_PJRT_CLIENT', None)
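
A minimal usage sketch of these helpers (the function name list_jax_devices is hypothetical): requires_jax first initializes the PJRT computation client, then runs the wrapped function under jax_env_context, so importing JAX inside the body neither grabs the TPU first nor re-discovers MegaScale devices.

    from torch_xla._internal.jax_workarounds import requires_jax

    @requires_jax
    def list_jax_devices():
      # Safe to import JAX here: the PJRT client is already initialized and
      # SKIP_MEGASCALE_PJRT_CLIENT is set for the duration of this call.
      import jax
      return jax.devices()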

torch_xla/core/xla_builder.py

Lines changed: 3 additions & 3 deletions
@@ -3,8 +3,8 @@
 from weakref import WeakKeyDictionary
 import torch
 import torch_xla
-from torch.utils._pytree import tree_flatten, tree_unflatten
-from torch_xla.experimental.custom_kernel import _jax_env_context, jax_import_guard
+from torch.utils._pytree import tree_flatten
+from torch_xla._internal.jax_workarounds import jax_env_context, jax_import_guard
 
 
 class Type:
@@ -862,7 +862,7 @@ def jax_func_to_xla_computation(jax_func, args, kwargs, name=None):
 
   # Prevent JAX from discovering MegaScale devices a second time. If we don't do this,
   # then the MegaScale device discovery will hang.
-  with _jax_env_context():
+  with jax_env_context():
     import jax
     import torchax.ops.mappings as mappings
 

torch_xla/distributed/spmd/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -4,7 +4,7 @@
     mark_sharding, mark_sharding_with_gradients, clear_sharding, get_1d_mesh,
     wrap_if_sharded, xla_patched_nn_linear_forward, set_global_mesh,
     get_global_mesh, _mark_manual_sharding, enable_manual_sharding,
-    disable_manual_sharding, apply_backward_optimization_barrier)
+    disable_manual_sharding, apply_backward_optimization_barrier, shard_as)
 from .api import xla_distribute_tensor, xla_distribute_module, auto_policy
 
 __all__ = [
@@ -19,6 +19,7 @@
     "MarkShardingFunction"
     "mark_sharding",
     "mark_sharding_with_gradients",
+    "shard_as",
     "clear_sharding",
     "get_1d_mesh",
     "wrap_if_sharded",

torch_xla/distributed/spmd/xla_sharding.py

Lines changed: 44 additions & 6 deletions
@@ -7,19 +7,23 @@
 from torch import Tensor
 from torch.library import custom_op
 import torch_xla
+import torch_xla.core.xla_builder as xb
 import torch_xla.core.xla_model as xm
 import torch_xla._internal.utils as _utils
 from torch_xla.distributed.spmd import XLAShardedTensor, XLAShard
 import torch_xla.runtime as xr
 import torch_xla.debug.profiler as xp
+from torch_xla._internal.jax_workarounds import requires_jax
 
 import numpy as np
 import functools
 import itertools
-from typing import Union, Sequence, Any, Optional
+from typing import TypeVar, Union, Any, Optional
+from collections.abc import Sequence
 from enum import IntEnum
 
 from torch.amp import custom_fwd, custom_bwd
+from torch.utils._pytree import tree_flatten, tree_unflatten
 
 PartitionSpec = tuple[Union[tuple[Union[int, str], ...], int, str, None], ...]
 """PartitionSpec describes the sharding of a tensor.
@@ -574,7 +578,7 @@ def mark_sharding(t: Union[torch.Tensor, XLAShardedTensor], mesh: Mesh,
   >>> xs.mark_sharding(linear.weight, mesh, (None, 1)) # 2-way model parallel
   """
   # We only allow fully specified `partition_spec` to be applicable, as opposed
-  # to filling in the unspecified replicated dims. Fully specified `partiion_spec`
+  # to filling in the unspecified replicated dims. Fully specified `partition_spec`
   # should be of the same rank as `t`. This is to support partial replication
   # where the group assignment may vary with different input ranks.
   assert len(t.shape) == len(partition_spec), \
@@ -588,8 +592,7 @@
 
 def mark_sharding_with_gradients(
     t: Union[torch.Tensor, XLAShardedTensor], mesh: Mesh,
-    partition_spec: tuple[Union[tuple, int, str, None],
-                          ...]) -> XLAShardedTensor:
+    partition_spec: tuple[Union[tuple, int, str, None], ...]) -> torch.Tensor:
   """
   A function to add sharding annotations on intermediate tensors (not in-place) and the gradient
   of the intermediate tensors during backward pass.
@@ -618,13 +621,48 @@
   This version can also be used in AOTAutograd.
   """
   # We only allow fully specified `partition_spec` to be applicable, as opposed
-  # to filling in the unspecified replicated dims. Fully specified `partiion_spec`
+  # to filling in the unspecified replicated dims. Fully specified `partition_spec`
   # should be of the same rank as `t`. This is to support partial replication
   # where the group assignment may vary with different input ranks.
   assert len(t.shape) == len(partition_spec), \
     f"Partition spec length ({len(partition_spec)}) should be equal to the input rank ({len(t.shape)})."
 
-  return MarkShardingFunction.apply(t, mesh, partition_spec)
+  r = MarkShardingFunction.apply(t, mesh, partition_spec)
+  assert isinstance(r, torch.Tensor)
+  return r
+
+
+PyTreeA = TypeVar('PyTreeA')
+PyTreeB = TypeVar('PyTreeB')
+
+
+@requires_jax
+def shard_as(a: PyTreeA, b: PyTreeB) -> tuple[PyTreeA, PyTreeB]:
+  """Ensure that `a` and `b` are sharded the same way without specifying
+  a particular sharding constraint.
+
+  shard_as takes two PyTrees of matching structure and returns
+  two PyTrees of that same structure. As long as you use at least one
+  of the outputs, then corresponding tensors in all four PyTrees
+  (a, b, out[0], out[1]) will be sharded the same way.
+  """
+
+  a_flat, a_spec = tree_flatten(a)
+  b_flat, b_spec = tree_flatten(b)
+  assert a_spec == b_spec, f"a and b must have the same structure. got {a_spec} and {b_spec}"
+  a_sharded_flat = []
+  b_sharded_flat = []
+  from jax.experimental.shard_alike import shard_alike
+  for x, y in zip(a_flat, b_flat):
+    if x is None or y is None:
+      # If there are None leaves, then it should be None in both PyTrees.
+      assert x is None and y is None
+    else:
+      x, y = xb.call_jax(shard_alike, (x, y))
+    a_sharded_flat.append(x)
+    b_sharded_flat.append(y)
+  return tree_unflatten(a_sharded_flat,
+                        a_spec), tree_unflatten(b_sharded_flat, b_spec)
 
 
 def clear_sharding(t: Union[torch.Tensor, XLAShardedTensor]) -> torch.Tensor:
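
A hedged usage sketch of the new shard_as (tensor names and shapes are illustrative, and JAX must be installed since the helper is decorated with requires_jax): only x gets an explicit annotation, and y ends up with the same sharding because both pass through shard_as and the outputs are used.

    import torch
    import torch_xla
    import torch_xla.runtime as xr
    import torch_xla.distributed.spmd as xs

    xr.use_spmd()
    mesh = xs.get_1d_mesh()  # one mesh axis spanning all devices

    x = torch.randn(8, 128, device='xla')
    y = torch.randn(8, 128, device='xla')
    xs.mark_sharding(x, mesh, (0, None))  # shard x along dim 0

    # x and y (and the returned aliases) now share the same sharding.
    x, y = xs.shard_as(x, y)
    torch_xla.sync()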
