Commit 2ad5b5d

Support collective matmul optimization in mp (#8855)
Co-authored-by: Yifei Teng <yifeit@google.com>
1 parent 760675a commit 2ad5b5d

17 files changed: +282 −63 lines

setup.py
Lines changed: 4 additions & 4 deletions

@@ -66,10 +66,10 @@

 USE_NIGHTLY = True  # whether to use nightly or stable libtpu and jax

-_date = '20250303'
-_libtpu_version = '0.0.11'
-_jax_version = '0.5.2'
-_jaxlib_version = '0.5.2'
+_date = '20250320'
+_libtpu_version = '0.0.12'
+_jax_version = '0.5.4'
+_jaxlib_version = '0.5.4'

 _libtpu_wheel_name = f'libtpu-{_libtpu_version}'
 _libtpu_storage_directory = 'libtpu-lts-releases'

test/test_mp_all_gather.py
Lines changed: 1 addition & 0 deletions

@@ -18,6 +18,7 @@ def _mp_fn(index):
   # Testing with a single replica group
   ordinal_tensor = torch.tensor([index], dtype=torch.float).to(device)
   result = xm.all_gather(ordinal_tensor, dim=0)
+  xm.mark_step()

   cpu_result = result.cpu()
   expected = torch.arange(0, world_size, dtype=torch.float)

test/test_mp_collective_matmul.py
Lines changed: 56 additions & 0 deletions

@@ -0,0 +1,56 @@
+import os
+import sys
+import torch
+import torch_xla
+from torch_xla import runtime as xr
+import torch_xla.core.xla_model as xm
+
+
+def _mp_fn(index):
+  os.environ["ENABLE_COLLECTIVE_MATMUL_IN_MP"] = "1"
+  device = xm.xla_device()
+  world_size = xr.world_size()
+  groups = [[i for i in range(world_size)]]
+  scale = 1 / world_size
+  scatter_dim = 1
+  shard_size = 2
+
+  if xm.xla_device_hw(device) in ('TPU',):
+    # Testing with a single replica group, channel_id and use_global_device_ids
+    ordinal_tensor = torch.tensor([index], dtype=torch.float).to(device)
+    result = xm.all_gather(
+        ordinal_tensor,
+        dim=0,
+        groups=groups,
+        channel_id=1,
+        use_global_device_ids=True)
+    xm.mark_step()
+
+    cpu_result = result.cpu()
+    expected = torch.arange(0, world_size, dtype=torch.float)
+    assert cpu_result.allclose(expected)
+
+    rand = torch.rand((32, shard_size * world_size, 32))
+    xrand = rand.to(device)
+
+    res = xm.reduce_scatter(
+        xm.REDUCE_SUM,
+        xrand,
+        scale,
+        scatter_dim,
+        world_size,
+        groups=groups,
+        channel_id=1,
+        use_global_device_ids=True)
+    expected_world = xm.all_reduce(xm.REDUCE_SUM, xrand, scale)
+    xm.mark_step()
+
+    slice_idx = torch.tensor(
+        list(range(index * shard_size, (index + 1) * shard_size)))
+    expected = expected_world.cpu().index_select(scatter_dim, slice_idx)
+
+    assert res.cpu().allclose(expected)
+
+
+if __name__ == '__main__':
+  torch_xla.launch(_mp_fn, args=())
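Note: the assertions above only check numerics. To also confirm that the channel handle reaches the lowered graph, a small debugging sketch can be dropped into the test; it is not part of this commit and assumes the existing torch_xla._XLAC._get_xla_tensors_hlo debug binding:

# Hypothetical check, placed right before the first xm.mark_step() while the
# all-gather for `result` is still pending: dump the HLO and look for the op.
hlo_text = torch_xla._XLAC._get_xla_tensors_hlo([result])
assert 'all-gather' in hlo_text  # with channel_id=1 the op should carry a channel handle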

test/tpu/run_tests.sh
Lines changed: 1 addition & 0 deletions

@@ -10,6 +10,7 @@ python3 "$TEST_CDIR/test_operations.py" -v
 python3 "$TEST_CDIR/pjrt/test_runtime_tpu.py"
 python3 "$TEST_CDIR/pjrt/test_collective_ops_tpu.py"
 python3 "$TEST_CDIR/spmd/test_mp_input_sharding.py"
+python3 "$TEST_CDIR/test_mp_collective_matmul.py"
 run_save_tensor_hlo python3 "$TEST_CDIR/spmd/test_spmd_lowering_context.py"
 python3 "$TEST_CDIR/spmd/test_xla_sharding.py"
 python3 "$TEST_CDIR/spmd/test_xla_virtual_device.py"

torch_xla/_internal/tpu.py
Lines changed: 16 additions & 0 deletions

@@ -192,6 +192,22 @@ def version() -> int:
   return int(match.groups()[0])


+def get_tpu_type() -> str:
+  """
+  Return the tpu type. E.g. "v6e-8" returns "v6e"
+  """
+  try:
+    env = get_tpu_env()
+  except requests.HTTPError as e:
+    raise EnvironmentError('Failed to get TPU metadata') from e
+
+  match = re.search(r"^([^-]*)-", env[xenv.ACCELERATOR_TYPE])
+  if match:
+    return match.group(1)
+  else:
+    return env[xenv.ACCELERATOR_TYPE]
+
+
 def get_worker_ips() -> List[str]:
   """Returns ordered list of TPU worker IPs from TPU metadata."""
   if _using_env_vars():
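For reference, a short usage sketch of the new helper. It only works on a TPU host, since get_tpu_env() queries the TPU metadata server; elsewhere the HTTPError is re-raised as EnvironmentError:

from torch_xla._internal import tpu

# On a v6e-8 host, ACCELERATOR_TYPE is 'v6e-8' and this prints 'v6e'.
print(tpu.get_tpu_type())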

torch_xla/core/xla_model.py
Lines changed: 15 additions & 5 deletions

@@ -20,6 +20,7 @@
 import torch_xla.utils.utils as xu
 import torch_xla.utils.closures as xc
 from torch_xla.distributed.spmd.xla_sharding import ShardingSpec
+from torch_xla.distributed.xla_multiprocessing import create_optimized_replica_groups
 import os
 from torch_xla.experimental.deprecation import deprecated
 import torch_xla._internal.utils as _utils
@@ -532,7 +533,9 @@ def all_gather(value: torch.Tensor,
                dim: int = 0,
                groups: Optional[List[List[int]]] = None,
                output: Optional[torch.Tensor] = None,
-               pin_layout: bool = True) -> torch.Tensor:
+               pin_layout: bool = True,
+               channel_id=None,
+               use_global_device_ids=None) -> torch.Tensor:
   """Performs an all-gather operation along a given dimension.

   Args:
@@ -550,7 +553,8 @@ def all_gather(value: torch.Tensor,
       participate in the communication has slightly different program, but it might
       cause some xla compilation to fail. Unpin the layout when you see error message
       like "HloModule has a mix of layout constrained".
-
+    channel_id (int, optional): Optional channel ID for cross-module communication
+    use_global_device_ids(bool, optional): If true, interprets ids in ReplicaGroup as global device ids
   Returns:
     A tensor which has, in the ``dim`` dimension, all the values from the
     participating replicas.
@@ -584,7 +588,8 @@ def all_gather(value: torch.Tensor,
     return output

   result = torch_xla._XLAC._xla_all_gather(value, dim, shard_count, groups or
-                                           [], pin_layout)
+                                           [], pin_layout, channel_id,
+                                           use_global_device_ids)
   return result

   # Now the input should be a list of Tensors.
@@ -870,7 +875,9 @@ def reduce_scatter(reduce_type: str,
                    groups: Optional[List[List[int]]] = None,
                    output: Optional[Union[torch.Tensor,
                                           List[torch.Tensor]]] = None,
-                   pin_layout: bool = True) -> torch.Tensor:
+                   pin_layout: bool = True,
+                   channel_id=None,
+                   use_global_device_ids=None) -> torch.Tensor:
   """Performs a XLA `ReduceScatter()` operation on the input tensor.

   See: https://www.tensorflow.org/xla/operation_semantics#reducescatter
@@ -896,6 +903,8 @@ def reduce_scatter(reduce_type: str,
       participate in the communication has slightly different program, but it might
       cause some xla compilation to fail. Unpin the layout when you see error message
       like "HloModule has a mix of layout constrained".
+    channel_id (int, optional): Optional channel ID for cross-module communication
+    use_global_device_ids(bool, optional): If true, interprets ids in ReplicaGroup as global device ids

   Returns:
     A `torch.Tensor` with all the values reduced across replicas. Each process
@@ -916,7 +925,8 @@ def reduce_scatter(reduce_type: str,
   result = torch_xla._XLAC._xla_reduce_scatter(reduce_type, input, token,
                                                scale, scatter_dim,
                                                shard_count, groups or [],
-                                               pin_layout)
+                                               pin_layout, channel_id,
+                                               use_global_device_ids)
   torch_xla._XLAC._set_all_reduce_token(devctx.device, result[1])
   return result[0]
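At the Python level the change amounts to two new optional keyword arguments that are forwarded untouched to the C++ bindings. A minimal usage sketch, assuming a multi-process TPU run launched with torch_xla.launch and mirroring the new test above (channel_id=1 and the single replica group are illustrative values, not requirements):

import torch
import torch_xla
import torch_xla.core.xla_model as xm
from torch_xla import runtime as xr


def _mp_fn(index):
  device = xm.xla_device()
  world_size = xr.world_size()
  groups = [list(range(world_size))]
  t = torch.tensor([float(index)], device=device)
  # channel_id gives the collective a channel for cross-module communication;
  # use_global_device_ids makes the ids in the replica groups be read as
  # global device ids (see the docstring additions above).
  gathered = xm.all_gather(
      t, dim=0, groups=groups, channel_id=1, use_global_device_ids=True)
  scattered = xm.reduce_scatter(
      xm.REDUCE_SUM,
      gathered,
      scale=1.0 / world_size,
      scatter_dim=0,
      shard_count=world_size,
      groups=groups,
      channel_id=1,
      use_global_device_ids=True)
  xm.mark_step()


if __name__ == '__main__':
  torch_xla.launch(_mp_fn, args=())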

torch_xla/csrc/cross_replica_reduces.cpp
Lines changed: 29 additions & 8 deletions

@@ -232,10 +232,20 @@ AllToAllResult BuildAllToAll(xla::XlaOp input, xla::XlaOp token,
 AllGatherResult BuildAllGather(xla::XlaOp input, xla::XlaOp token, int64_t dim,
                                int64_t shard_count,
                                const std::vector<std::vector<int64_t>>& groups,
-                               bool pin_layout) {
+                               bool pin_layout,
+                               std::optional<int64_t> channel_id,
+                               std::optional<bool> use_global_device_ids) {
   std::vector<xla::ReplicaGroup> reduce_groups = CreateReduceGroups(groups);
   const xla::Shape& input_shape = ShapeHelper::ShapeOfXlaOp(input);
   TokenHandler token_handler(token);
+  std::optional<xla::ChannelHandle> channel_handle = std::nullopt;
+  if (channel_id.has_value()) {
+    xla::ChannelHandle channel_handle_value;
+    channel_handle_value.set_type(xla::ChannelHandle::DEVICE_TO_DEVICE);
+    channel_handle_value.set_handle(channel_id.value());
+    channel_handle = channel_handle_value;
+  }
+
   xla::XlaOp all_gather_result;
   if (pin_layout) {
     torch::lazy::BackendDevice xla_device = bridge::GetCurrentDevice();
@@ -245,12 +255,13 @@ AllGatherResult BuildAllGather(xla::XlaOp input, xla::XlaOp token, int64_t dim,
         static_cast<XlaDeviceType>(xla_device.type()));
     all_gather_result =
         xla::AllGather(token_handler.GetInput(input, &input_shape), dim,
-                       shard_count, reduce_groups, /*channel_id=*/absl::nullopt,
-                       /*layout=*/reduce_shape.layout());
+                       shard_count, reduce_groups, channel_handle,
+                       /*layout=*/reduce_shape.layout(), use_global_device_ids);
   } else {
     all_gather_result =
         xla::AllGather(token_handler.GetInput(input, &input_shape), dim,
-                       shard_count, reduce_groups);
+                       shard_count, reduce_groups, channel_handle,
+                       /*layout=*/std::nullopt, use_global_device_ids);
   }
   return {all_gather_result, token_handler.GetNewToken(all_gather_result)};
 }
@@ -389,10 +400,19 @@ RecvResult BuildRecvWithToken(xla::XlaOp token, const xla::Shape& recv_shape,
 ReduceScatterResult BuildReduceScatter(
     AllReduceType reduce_type, xla::XlaOp input, xla::XlaOp token, double scale,
     int64_t scatter_dim, int64_t shard_count,
-    const std::vector<std::vector<int64_t>>& groups, bool pin_layout) {
+    const std::vector<std::vector<int64_t>>& groups, bool pin_layout,
+    std::optional<int64_t> channel_id,
+    std::optional<bool> use_global_device_ids) {
   std::vector<xla::ReplicaGroup> reduce_groups = CreateReduceGroups(groups);
   TokenHandler token_handler(token);
   const xla::Shape& input_shape = ShapeHelper::ShapeOfXlaOp(input);
+  std::optional<xla::ChannelHandle> channel_handle = std::nullopt;
+  if (channel_id.has_value()) {
+    xla::ChannelHandle channel_handle_value;
+    channel_handle_value.set_type(xla::ChannelHandle::DEVICE_TO_DEVICE);
+    channel_handle_value.set_handle(channel_id.value());
+    channel_handle = channel_handle_value;
+  }
   xla::XlaOp reduce_result;
   if (pin_layout) {
     torch::lazy::BackendDevice xla_device = bridge::GetCurrentDevice();
@@ -403,13 +423,14 @@ ReduceScatterResult BuildReduceScatter(
     reduce_result = xla::ReduceScatter(
         token_handler.GetInput(input, &input_shape),
         GetReduceComutation(reduce_type, input_shape.element_type()),
-        scatter_dim, shard_count, reduce_groups, /*channel_id=*/absl::nullopt,
-        /*layout=*/reduce_shape.layout());
+        scatter_dim, shard_count, reduce_groups, channel_handle,
+        /*layout=*/reduce_shape.layout(), use_global_device_ids);
   } else {
     reduce_result = xla::ReduceScatter(
         token_handler.GetInput(input, &input_shape),
         GetReduceComutation(reduce_type, input_shape.element_type()),
-        scatter_dim, shard_count, reduce_groups);
+        scatter_dim, shard_count, reduce_groups, channel_handle,
+        /*layout=*/std::nullopt, use_global_device_ids);
   }

   if (scale != 1.0) {

torch_xla/csrc/cross_replica_reduces.h
Lines changed: 8 additions & 5 deletions

@@ -75,10 +75,11 @@ AllToAllResult BuildAllToAll(xla::XlaOp input, xla::XlaOp token,
                              const std::vector<std::vector<int64_t>>& groups,
                              bool pin_layout);

-AllGatherResult BuildAllGather(xla::XlaOp input, xla::XlaOp token, int64_t dim,
-                               int64_t shard_count,
-                               const std::vector<std::vector<int64_t>>& groups,
-                               bool pin_layout);
+AllGatherResult BuildAllGather(
+    xla::XlaOp input, xla::XlaOp token, int64_t dim, int64_t shard_count,
+    const std::vector<std::vector<int64_t>>& groups, bool pin_layout,
+    std::optional<int64_t> channel_id = std::nullopt,
+    std::optional<bool> use_global_device_ids = std::nullopt);

 AllGatherResultCoalesced BuildAllGatherCoalesced(
     absl::Span<const xla::XlaOp> inputs, xla::XlaOp token, int64_t dim,
@@ -98,7 +99,9 @@ RecvResult BuildRecvWithToken(xla::XlaOp token, const xla::Shape& recv_shape,
 ReduceScatterResult BuildReduceScatter(
     AllReduceType reduce_type, xla::XlaOp input, xla::XlaOp token, double scale,
     int64_t scatter_dim, int64_t shard_count,
-    const std::vector<std::vector<int64_t>>& groups, bool pin_layout);
+    const std::vector<std::vector<int64_t>>& groups, bool pin_layout,
+    std::optional<int64_t> channel_id = std::nullopt,
+    std::optional<bool> use_global_device_ids = std::nullopt);

 xla::XlaOp BuildReduceScatter(AllReduceType reduce_type, xla::XlaOp input,
                               double scale, int64_t scatter_dim,

torch_xla/csrc/init_python_bindings.cpp
Lines changed: 31 additions & 22 deletions

@@ -373,13 +373,16 @@ std::pair<at::Tensor, std::shared_ptr<torch::lazy::Value>> ReduceScatter(
     const std::string& reduce_type, const at::Tensor& input,
     const std::shared_ptr<torch::lazy::Value>& token, double scale,
     int64_t scatter_dim, int64_t shard_count,
-    const std::vector<std::vector<int64_t>>& replica_groups, bool pin_layout) {
+    const std::vector<std::vector<int64_t>>& replica_groups, bool pin_layout,
+    std::optional<int64_t> channel_id = std::nullopt,
+    std::optional<bool> use_global_device_ids = std::nullopt) {
   TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::");
   XLATensorPtr result;
   torch::lazy::Value new_token;
   std::tie(result, new_token) = tensor_methods::reduce_scatter(
       bridge::GetXlaTensor(input), *token, GetReduceType(reduce_type), scale,
-      scatter_dim, shard_count, replica_groups, pin_layout);
+      scatter_dim, shard_count, replica_groups, pin_layout, channel_id,
+      use_global_device_ids);
   return std::pair<at::Tensor, std::shared_ptr<torch::lazy::Value>>(
       bridge::AtenFromXlaTensor(std::move(result)),
       std::make_shared<torch::lazy::Value>(new_token));
@@ -437,11 +440,13 @@ std::shared_ptr<torch::lazy::Value> ReduceScatterCoalescedOut(

 at::Tensor AllGather(const at::Tensor& input, int64_t dim, int64_t shard_count,
                      const std::vector<std::vector<int64_t>>& replica_groups,
-                     bool pin_layout) {
+                     bool pin_layout,
+                     std::optional<int> channel_id = std::nullopt,
+                     std::optional<bool> use_global_device_ids = std::nullopt) {
   TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::");
-  auto result =
-      tensor_methods::all_gather(bridge::GetXlaTensor(input), dim, shard_count,
-                                 replica_groups, pin_layout);
+  auto result = tensor_methods::all_gather(
+      bridge::GetXlaTensor(input), dim, shard_count, replica_groups, pin_layout,
+      channel_id, use_global_device_ids);
   return bridge::AtenFromXlaTensor(std::move(result));
 }

@@ -1659,18 +1664,21 @@ void InitXlaModuleBindings(py::module m) {
           result_tuple[1] = new_token;
           return result_tuple;
         });
-  m.def("_xla_all_gather", [](const at::Tensor& input, int64_t dim,
-                              int64_t shard_count, const py::list& groups,
-                              bool pin_layout) {
-    std::vector<std::vector<int64_t>> replica_groups =
-        CreateReduceGroups(groups);
-    at::Tensor result;
-    {
-      NoGilSection nogil;
-      result = AllGather(input, dim, shard_count, replica_groups, pin_layout);
-    }
-    return result;
-  });
+  m.def("_xla_all_gather",
+        [](const at::Tensor& input, int64_t dim, int64_t shard_count,
+           const py::list& groups, bool pin_layout,
+           std::optional<int> channel_id = std::nullopt,
+           std::optional<bool> use_global_device_ids = std::nullopt) {
+          std::vector<std::vector<int64_t>> replica_groups =
+              CreateReduceGroups(groups);
+          at::Tensor result;
+          {
+            NoGilSection nogil;
+            result = AllGather(input, dim, shard_count, replica_groups,
+                               pin_layout, channel_id, use_global_device_ids);
+          }
+          return result;
+        });
   m.def("_xla_all_gather_out",
         [](at::Tensor& output, const at::Tensor& input,
            const std::shared_ptr<torch::lazy::Value>& token, int64_t dim,
@@ -1788,16 +1796,17 @@ void InitXlaModuleBindings(py::module m) {
         [](const std::string& reduce_type, const at::Tensor& input,
            const std::shared_ptr<torch::lazy::Value>& token, double scale,
            int64_t scatter_dim, int64_t shard_count, const py::list& groups,
-           bool pin_layout) {
+           bool pin_layout, std::optional<int64_t> channel_id = std::nullopt,
+           std::optional<bool> use_global_device_ids = std::nullopt) {
          std::vector<std::vector<int64_t>> replica_groups =
              CreateReduceGroups(groups);
          at::Tensor result;
          std::shared_ptr<torch::lazy::Value> new_token;
          {
            NoGilSection nogil;
-            std::tie(result, new_token) =
-                ReduceScatter(reduce_type, input, token, scale, scatter_dim,
-                              shard_count, replica_groups, pin_layout);
+            std::tie(result, new_token) = ReduceScatter(
+                reduce_type, input, token, scale, scatter_dim, shard_count,
+                replica_groups, pin_layout, channel_id, use_global_device_ids);
          }
          auto result_tuple = py::tuple(2);
          result_tuple[0] = torch::autograd::make_variable(
