Commit 1acc987

Add autograd function for mark_sharding (#8723)
1 parent a08b7f8 commit 1acc987

File tree

3 files changed: +48 −0

  test/spmd/test_xla_sharding.py
  torch_xla/distributed/spmd/__init__.py
  torch_xla/distributed/spmd/xla_sharding.py


test/spmd/test_xla_sharding.py

Lines changed: 18 additions & 0 deletions
@@ -17,6 +17,7 @@
 import torch_xla.debug.metrics as met
 import torch_xla.distributed.spmd as xs
 from torch_xla.distributed.spmd import XLAShardedTensor
+from torch_xla.distributed.spmd.xla_sharding import MarkShardingFunction
 import torch_xla.distributed.parallel_loader as pl
 import test_xla_sharding_base

@@ -835,6 +836,23 @@ def test_mark_sharding_ir(self):

     self.assertTrue(torch.allclose(expected, actual.cpu()))

+  @unittest.skipUnless(xr.global_runtime_device_count() > 1,
+                       "Multiple devices required for autograd sharding test")
+  def test_mark_sharding_autograd(self):
+    x = torch.randn(8, 8, requires_grad=True)
+    x = x.to('xla')
+    mesh = self._get_mesh((1, self.n_devices))
+    # Forward pass
+    z = x @ x
+    z.retain_grad()  # To be able to extract HLO from intermediate tensor grad.
+    y = MarkShardingFunction.apply(z, mesh, (0, 1))
+    t = y.sum()
+    # Backward pass
+    t.backward()
+    hlo = torch_xla._XLAC._get_xla_tensors_hlo([z.grad])
+    sharding_annotation = 'sharding={devices=[1,%d]' % self.n_devices
+    self.assertIn(sharding_annotation, hlo)
+
   def test_sharded_tensor_aliasing(self):
     met.clear_all()
     partition_spec = (0, 1)

torch_xla/distributed/spmd/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -17,6 +17,7 @@
     "ShardingType",
     "ShardingSpec",
     "XLAPatchedLinear",
+    "MarkShardingFunction",
     "mark_sharding",
     "clear_sharding",
     "get_1d_mesh",

torch_xla/distributed/spmd/xla_sharding.py

Lines changed: 29 additions & 0 deletions
@@ -1123,3 +1123,32 @@ def _generate_logical_mesh(
       logical_mesh_shape)  # type: ignore  # numpy 2.2

   return logical_mesh
+
+
+class MarkShardingFunction(torch.autograd.Function):
+  """
+  Autograd function that applies mark_sharding to an intermediate tensor during
+  the forward pass and to that tensor's gradient during the backward pass.
+
+  Usage:
+  new_tensor = MarkShardingFunction.apply(tensor, mesh, ('axis_1', 'axis_2'))
+
+  This helps guide GSPMD sharding propagation through the backward pass; in
+  complicated workloads the compiler can otherwise introduce extra collectives
+  that hurt performance.
+  """
+
+  @staticmethod
+  def forward(ctx, torch_tensor: torch.Tensor, mesh: Mesh,
+              partition_spec: Tuple) -> torch.Tensor:
+    mark_sharding(torch_tensor, mesh, partition_spec)
+    ctx.partition_spec = partition_spec
+    ctx.mesh = mesh
+    return torch_tensor
+
+  @staticmethod
+  def backward(ctx, grad_output: torch.Tensor) -> Tuple[torch.Tensor, None, None]:
+    partition_spec = ctx.partition_spec
+    mesh = ctx.mesh
+    mark_sharding(grad_output, mesh, partition_spec)
+    return grad_output, None, None
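
For context, here is a minimal usage sketch of the new function in a user-written forward/backward step, mirroring the pattern exercised by the test above. The mesh axis names ('data', 'model'), the 1 x N mesh shape, the tensor shapes, and the variable names are illustrative assumptions, not part of this commit; it also assumes SPMD mode is enabled and more than one device is visible.

# Hedged usage sketch; names and shapes below are illustrative assumptions.
import numpy as np
import torch
import torch_xla.runtime as xr
import torch_xla.distributed.spmd as xs
from torch_xla.distributed.spmd.xla_sharding import MarkShardingFunction

xr.use_spmd()  # enable SPMD execution mode

num_devices = xr.global_runtime_device_count()
mesh = xs.Mesh(np.arange(num_devices), (1, num_devices), ('data', 'model'))

w = torch.randn(8, 8, requires_grad=True).to('xla')
x = torch.randn(8, 8).to('xla')

# Forward pass: annotate the intermediate activation so it is sharded along
# the 'model' mesh axis.
hidden = x @ w
hidden = MarkShardingFunction.apply(hidden, mesh, ('data', 'model'))

# Backward pass: MarkShardingFunction.backward applies the same annotation to
# hidden's gradient, so GSPMD does not have to infer it.
loss = hidden.sum()
loss.backward()

Compared with calling xs.mark_sharding on the activation directly, the autograd function also constrains the sharding of the gradient flowing back through that point, which is what the docstring means by reducing the chance of the compiler inserting extra collectives.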
