@@ -211,14 +211,31 @@ def shard_torch_xla_model_from_config(
   If `mesh` is not given, there must be a registered global mesh.
   """
   import torch_xla.distributed.spmd as xs
+  from torch_xla.distributed.spmd.xla_sharding import MarkShardingFunction
 
-  def shard_fn(tensor, spec: tuple[str, ...]):
+  def shard_activation(tensor, spec: tuple[str, ...]):
+    the_mesh = mesh if mesh is not None else xs.get_global_mesh()
+    assert the_mesh is not None, "No mesh found"
+    # TODO(https://github.com/pytorch/xla/issues/8678): Replace with the simpler
+    # `mark_sharding_and_gradients`.
+    out = MarkShardingFunction.apply(tensor, the_mesh, spec)
+    assert isinstance(out, torch.Tensor)
+    return out
+
+  # TODO(https://github.com/pytorch/xla/issues/8809): If we shard parameters with
+  # `MarkShardingFunction.apply`, that causes Mixtral to OOM. Gradient HLO arrays end up
+  # living much longer than needed.
+  def shard_param(tensor, spec: tuple[str, ...]):
     the_mesh = mesh if mesh is not None else xs.get_global_mesh()
     assert the_mesh is not None, "No mesh found"
-    # TODO(https://github.com/pytorch/xla/issues/8678): Shard the gradient too.
     return xs.mark_sharding(tensor, the_mesh, spec).global_tensor
 
-  return shard_model_from_config(model, config, shard_fn)
+  return shard_model_from_config(
+    model,
+    config,
+    shard_activation,
+    shard_param,
+  )
 
 
 def _process_tail_index_syntax(
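For context, the two helpers in this diff differ only in how they annotate sharding: `shard_param` calls `xs.mark_sharding`, which attaches the sharding annotation to the tensor itself, while `shard_activation` goes through `MarkShardingFunction.apply`, an autograd-aware wrapper that also annotates the gradient in the backward pass. Below is a minimal standalone sketch of the two call patterns; it is not part of this commit, and the mesh setup, the "fsdp" axis name, and the tensor shapes are illustrative assumptions.

# Illustrative sketch (not from the commit), assuming an SPMD-enabled
# torch_xla runtime; axis name and shapes are made up for the example.
import numpy as np
import torch
import torch_xla.core.xla_model as xm
import torch_xla.runtime as xr
import torch_xla.distributed.spmd as xs
from torch_xla.distributed.spmd.xla_sharding import MarkShardingFunction

xr.use_spmd()
num_devices = xr.global_runtime_device_count()
mesh = xs.Mesh(np.arange(num_devices), (num_devices,), ("fsdp",))
xs.set_global_mesh(mesh)  # lets helpers fall back to xs.get_global_mesh()

device = xm.xla_device()

# Parameter-style sharding: annotate the tensor; the gradient is not
# annotated (mirrors `shard_param` above).
weight = torch.randn(1024, 1024, device=device)
weight = xs.mark_sharding(weight, mesh, ("fsdp", None)).global_tensor

# Activation-style sharding: MarkShardingFunction is an autograd.Function,
# so the same annotation is also applied to the gradient during backward
# (mirrors `shard_activation` above).
activation = torch.randn(8, 1024, device=device, requires_grad=True)
activation = MarkShardingFunction.apply(activation, mesh, ("fsdp", None))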