
Commit 8e6a5e5

composability of assume_pure and call_jax (#8989)
Parent commit: e9cb086

11 files changed: +323 −54 lines

.github/workflows/torchax.yml

Lines changed: 1 addition & 0 deletions
@@ -56,4 +56,5 @@ jobs:
   pytest test/test_libraries.py
   pytest test/test_symbolic_shapes.py
   pytest test/test_exports.py
+  pytest test/test_util.py
   XLA_FLAGS=--xla_force_host_platform_device_count=4 pytest -n 0 test_dist/

test/scan/test_scan_spmd.py

Lines changed: 14 additions & 0 deletions
@@ -7,6 +7,7 @@
 import torch_xla
 import torch.nn as nn
 from torch_xla.distributed.spmd.xla_sharding import apply_xla_patch_to_nn_linear, Mesh
+from torch_xla.experimental.assume_pure import assume_pure
 from torch_xla.experimental.scan import scan
 from torch_xla.experimental.scan_layers import scan_layers
 from torch_xla.distributed.spmd import mark_sharding, mark_sharding_with_gradients, set_global_mesh, get_1d_mesh, get_global_mesh
@@ -229,6 +230,19 @@ def check_dots_in_model(self, model, x, expect_pattern):
   def count_regex(self, hlo_text, regex_str):
     return len(re.findall(regex_str, hlo_text))

+  def test_assume_pure_works_with_mark_sharding(self):
+    x = torch.randn((3, 4, 5, 128), device='xla')
+    assume_pure(mark_sharding)(x, self.spmd_mesh, ("model", None, None, None))
+    # assert not throwing
+
+  def test_convert_to_jax_mesh(self):
+    jax_mesh = self.spmd_mesh.maybe_convert_and_get_jax_mesh()
+    self.assertEqual(jax_mesh.devices.shape, self.spmd_mesh.mesh_shape)
+    np.testing.assert_equal(
+        np.array([dev.id for dev in jax_mesh.devices.flatten()]),
+        self.spmd_mesh.device_ids)
+    # assert not throwing
+

 if __name__ == '__main__':
   test = unittest.main()
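For context, `test_assume_pure_works_with_mark_sharding` relies on an SPMD mesh created in the test class's setup, which is not shown in this hunk. A minimal self-contained sketch of the same call, assuming a 1-D global mesh whose single axis is named "model" (set up via `get_1d_mesh`/`set_global_mesh`, both imported above):

import torch
import torch_xla
import torch_xla.runtime as xr
from torch_xla.distributed.spmd import mark_sharding, set_global_mesh, get_1d_mesh
from torch_xla.experimental.assume_pure import assume_pure

xr.use_spmd()                 # enable SPMD mode before creating XLA tensors
mesh = get_1d_mesh("model")   # 1-D mesh over all devices, axis named "model"
set_global_mesh(mesh)

x = torch.randn((3, 4, 5, 128), device='xla')
# The partition spec length must match the tensor rank: dim 0 is sharded
# over the "model" axis, the remaining dims are replicated (None).
assume_pure(mark_sharding)(x, mesh, ("model", None, None, None))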

test/test_assume_pure.py

Lines changed: 39 additions & 0 deletions
@@ -369,6 +369,44 @@ def original_func(a, b):
     self.assertIsNone(a_pure.grad)
     self.assertIsNone(b_pure.grad)

+  def test_composibility_with_call_jax(self):
+
+    def jax_func(a, b):
+      return jnp.dot(a, b)
+
+    def f(a, b):
+      return xb.call_jax(jax_func, (a, b))
+
+    a = torch.randn(3, 3, device='xla')
+    b = torch.randn(3, 3, device='xla')
+
+    output_pure = assume_pure(f)(a, b)
+    torch.testing.assert_close(
+        output_pure,
+        a @ b,
+        msg="Forward outputs do not match",
+        check_device=False)
+
+  def test_assume_pure_recursive(self):
+
+    @assume_pure
+    def torch_func(a, b):
+      return torch.matmul(a, b)
+
+    def f(a, b):
+      y = torch_func(a, b)
+      return y + 1
+
+    a = torch.randn(3, 3, device='xla')
+    b = torch.randn(3, 3, device='xla')
+
+    output_pure = assume_pure(f)(a, b)
+    torch.testing.assert_close(
+        output_pure,
+        a @ b + 1,
+        msg="Forward outputs do not match",
+        check_device=False)
+

 FLAGS = flags.FLAGS
 flags.DEFINE_integer(
@@ -436,5 +474,6 @@ def pure_call(params, x):
   torch_xla._XLAC._xla_set_mat_mul_precision('highest')
   jax_import_guard()
   import torchax
+  import jax.numpy as jnp
   torchax.enable_accuracy_mode()
   absltest.main()

torch_xla/_internal/jax_workarounds.py

Lines changed: 13 additions & 0 deletions
@@ -42,3 +42,16 @@ def jax_env_context():
     os.environ['SKIP_MEGASCALE_PJRT_CLIENT'] = previous_skip_megascale_env
   else:
     os.environ.pop('SKIP_MEGASCALE_PJRT_CLIENT', None)
+
+
+def maybe_get_torchax():
+  try:
+    jax_import_guard()
+    with jax_env_context():
+      import torchax
+      import torchax.tensor
+      import torchax.interop
+      import torchax.ops.mappings
+      return torchax
+  except ImportError:
+    return None
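`maybe_get_torchax` is the optional-dependency guard used by the call sites below (`call_jax`, `mark_sharding`) to take a torchax-specific branch only when torchax is importable. A small illustrative check built on nothing beyond what the helper provides (the function name `is_torchax_tensor` is made up for this sketch):

from torch_xla._internal.jax_workarounds import maybe_get_torchax

def is_torchax_tensor(t):
  # Returns False when torchax (or jax) cannot be imported, so the default
  # torch_xla code path keeps working without the optional dependency.
  tx = maybe_get_torchax()
  return tx is not None and isinstance(t, tx.tensor.Tensor)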

torch_xla/core/xla_builder.py

Lines changed: 7 additions & 3 deletions
@@ -4,7 +4,7 @@
 import torch
 import torch_xla
 from torch.utils._pytree import tree_flatten
-from torch_xla._internal.jax_workarounds import jax_env_context, jax_import_guard, requires_jax
+from torch_xla._internal.jax_workarounds import jax_env_context, jax_import_guard, requires_jax, maybe_get_torchax


 class Type:
@@ -869,7 +869,7 @@ def jax_func_to_xla_computation(jax_func, args, kwargs, name=None):
   # then the MegaScale device discovery will hang.
   with jax_env_context():
     import jax
-    import torchax.ops.mappings as mappings
+    tx = maybe_get_torchax()

     flattened_inputs, spec = jax.tree.flatten((args, kwargs))

@@ -878,7 +878,7 @@ def abstractify(a):  # make a pytree leaf abstract
        return None
      if isinstance(a, torch.Tensor):
        assert a.device.type == 'xla', f"Inputs must be XLA tensors. Got {a.device}"
-       return jax.ShapeDtypeStruct(a.shape, mappings.t2j_dtype(a.dtype))
+       return jax.ShapeDtypeStruct(a.shape, tx.ops.mappings.t2j_dtype(a.dtype))
      return a

    sample_inputs = tuple(abstractify(a) for a in flattened_inputs)
@@ -1019,6 +1019,10 @@ def call_jax(jax_func,
   import jax
   kwargs = kwargs or {}
   flattened, _spec = jax.tree.flatten((args, kwargs))
+  tx = maybe_get_torchax()
+  if tx is not None and any(isinstance(a, tx.tensor.Tensor) for a in flattened):
+    return tx.interop.call_jax(jax_func, *args, **kwargs)
+
   xla_computation = jax_func_to_xla_computation(jax_func, args, kwargs, name)
   return xla_computation(flattened)
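After this change `call_jax` has two paths: inputs containing torchax tensors (e.g. the tracers seen inside `assume_pure`) are forwarded to `torchax.interop.call_jax`, while plain XLA tensors are still lowered through `jax_func_to_xla_computation`. A minimal sketch of the unchanged XLA-tensor path (the function body and shapes are illustrative):

import torch
import torch_xla
import jax.numpy as jnp
import torch_xla.core.xla_builder as xb

def jax_fn(x, y):
  return jnp.sin(x) + jnp.cos(y)

x = torch.randn(4, 4, device='xla')
y = torch.randn(4, 4, device='xla')
# No torchax tensors among the inputs, so the JAX function is lowered to an
# XLA computation and executed on the XLA tensors directly.
out = xb.call_jax(jax_fn, (x, y))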

torch_xla/distributed/spmd/xla_sharding.py

Lines changed: 32 additions & 1 deletion
@@ -13,7 +13,7 @@
 from torch_xla.distributed.spmd import XLAShardedTensor, XLAShard
 import torch_xla.runtime as xr
 import torch_xla.debug.profiler as xp
-from torch_xla._internal.jax_workarounds import requires_jax
+from torch_xla._internal.jax_workarounds import requires_jax, maybe_get_torchax

 import numpy as np
 import functools
@@ -181,6 +181,29 @@ def from_str(cls, mesh_str: str) -> Optional["Mesh"]:
     except (ValueError, SyntaxError, KeyError, TypeError):
       return None

+  @requires_jax
+  def maybe_convert_and_get_jax_mesh(self):
+    # Construct a JAX mesh object with the same device ids shape and ordering
+    # from torch_xla device mesh.
+    import jax
+    import numpy as np
+    from jax._src import mesh as mesh_lib
+
+    axis_names = self.axis_names or tuple(
+        str(i) for i in range(len(self.mesh_shape)))
+
+    # Create a mapping from device ID to device object
+    all_devices = jax.devices()
+    device_id_to_device = {device.id: device for device in all_devices}
+    device_ids_array = self.device_ids.reshape(*self.mesh_shape)
+    device_array = np.empty(device_ids_array.shape, dtype=object)
+    device_array = np.vectorize(device_id_to_device.get)(device_ids_array)
+    if np.any(device_array == None):
+      raise ValueError(
+          f"torch_xla device ID {device_ids_array[device_array == None]} not found in available JAX devices"
+      )
+    return mesh_lib.Mesh(device_array, axis_names=axis_names)
+

 _GLOBAL_MESH: Optional[Mesh] = None

@@ -584,6 +607,14 @@ def mark_sharding(t: Union[torch.Tensor, XLAShardedTensor], mesh: Mesh,
   assert len(t.shape) == len(partition_spec), \
     f"Partition spec length ({len(partition_spec)}) should be equal to the input rank ({len(t.shape)})."

+  tx = maybe_get_torchax()
+  if tx is not None and isinstance(t, tx.tensor.Tensor):
+    from jax.sharding import PartitionSpec as P, NamedSharding
+    op_sharding = tuple(str(i) if i is not None else i for i in partition_spec)
+    jmesh = mesh.maybe_convert_and_get_jax_mesh()
+    t.shard_(NamedSharding(jmesh, P(*op_sharding)))
+    return t
+
   op_sharding = mesh.get_op_sharding(partition_spec)
   annotate_func = torch_xla._XLAC._xla_mark_sharding
   annotate_func(unwrap_sharded_tensor(t), op_sharding)
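The new `Mesh.maybe_convert_and_get_jax_mesh` is what lets the torchax branch of `mark_sharding` build a `NamedSharding`: it rebuilds the torch_xla mesh as a `jax.sharding.Mesh` with the same device ordering. A short sketch mirroring the new `test_convert_to_jax_mesh` test (the 1-D mesh layout and axis name here are assumptions for illustration):

import numpy as np
import torch_xla.runtime as xr
from torch_xla.distributed.spmd import Mesh

n = xr.global_runtime_device_count()
xla_mesh = Mesh(np.arange(n), mesh_shape=(n,), axis_names=("model",))

jax_mesh = xla_mesh.maybe_convert_and_get_jax_mesh()
# Same layout, but JAX device objects instead of integer device ids.
assert jax_mesh.devices.shape == xla_mesh.mesh_shape
assert [d.id for d in jax_mesh.devices.flatten()] == list(xla_mesh.device_ids)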

torch_xla/experimental/splash_attention.py

Lines changed: 1 addition & 27 deletions
@@ -63,32 +63,6 @@ def list_to_tuple(x):
   converted_data = {k: list_to_tuple(v) for k, v in json_data.items()}
   return SplashAttentionConfig(**converted_data)

-  @requires_jax
-  def maybe_convert_and_get_jax_mesh(self):
-    # Construct a JAX mesh object with the same device ids shape and ordering
-    # from torch_xla device mesh.
-    mesh = Mesh.from_str(self.mesh)
-    import jax
-    import numpy as np
-    from jax._src import mesh as mesh_lib
-
-    assert mesh.axis_names is not None, "Omitting axis names is not yet supported"
-
-    # Create a mapping from device ID to device object
-    all_devices = jax.devices()
-    device_id_to_device = {device.id: device for device in all_devices}
-    device_ids_array = mesh.device_ids.reshape(*mesh.mesh_shape)
-    device_array = np.empty(device_ids_array.shape, dtype=object)
-    for idx in np.ndindex(device_ids_array.shape):
-      device_id = device_ids_array[idx]
-      if device_id in device_id_to_device:
-        device_array[idx] = device_id_to_device[device_id]
-      else:
-        raise ValueError(
-            f"torch_xla device ID {device_id} not found in available JAX devices"
-        )
-    return mesh_lib.Mesh(device_array, axis_names=mesh.axis_names)
-

 @xp.trace_me("splash_attention_kernel_wrapper")
 def splash_attention_jax_wrapper(
@@ -112,7 +86,7 @@ def splash_attention_jax_wrapper(
       splash_attention_kernel,
       splash_attention_mask,
   )
-  mesh = config.maybe_convert_and_get_jax_mesh()
+  mesh = Mesh.from_str(config.mesh).maybe_convert_and_get_jax_mesh()
   # input q,k,v shape: [batch, #head, seq_len, head_dim]
   if decoder_segment_ids is not None and not decoder_segment_ids.shape:
     decoder_segment_ids = None

torchax/test/test_util.py

Lines changed: 107 additions & 0 deletions
@@ -0,0 +1,107 @@
+import unittest
+from torchax.util import partition, merge
+
+# Helper predicate functions for testing partition
+def is_even(n):
+  return isinstance(n, int) and n % 2 == 0
+
+def is_positive(n):
+  return isinstance(n, (int, float)) and n > 0
+
+def is_string(s):
+  return isinstance(s, str)
+
+
+class TestListUtils(unittest.TestCase):
+
+  # --- Tests for partition ---
+
+  def test_partition_empty_list(self):
+    """Test partition with an empty list."""
+    self.assertEqual(partition([], is_even), ([], []))
+
+  def test_partition_even_odd(self):
+    """Test partitioning numbers into even and odd."""
+    nums = [1, 2, 3, 4, 5, 6]
+    expected_truthy = [None, 2, None, 4, None, 6]
+    expected_falsy = [1, None, 3, None, 5, None]
+    self.assertEqual(partition(nums, is_even), (expected_truthy, expected_falsy))
+
+  def test_partition_all_true(self):
+    """Test partition when the predicate is always true."""
+    evens = [2, 4, 6, 8]
+    expected_truthy = [2, 4, 6, 8]
+    expected_falsy = [None, None, None, None]
+    self.assertEqual(partition(evens, is_even), (expected_truthy, expected_falsy))
+
+  def test_partition_all_false(self):
+    """Test partition when the predicate is always false."""
+    odds = [1, 3, 5, 7]
+    expected_truthy = [None, None, None, None]
+    expected_falsy = [1, 3, 5, 7]
+    self.assertEqual(partition(odds, is_even), (expected_truthy, expected_falsy))
+
+  def test_partition_mixed_types(self):
+    """Test partition with a list of mixed types."""
+    mixed = [1, "hello", 2.5, "world", 3, None]
+    # Using is_string as the predicate
+    expected_truthy = [None, "hello", None, "world", None, None]
+    expected_falsy = [1, None, 2.5, None, 3, None]  # Note: None itself is not a string
+    self.assertEqual(partition(mixed, is_string), (expected_truthy, expected_falsy))
+
+  def test_partition_with_lambda(self):
+    """Test partition using a lambda function as the predicate."""
+    nums = [-2, -1, 0, 1, 2]
+    expected_truthy = [None, None, None, 1, 2]
+    expected_falsy = [-2, -1, 0, None, None]
+    self.assertEqual(partition(nums, lambda x: isinstance(x, int) and x > 0), (expected_truthy, expected_falsy))
+
+  # --- Tests for merge ---
+
+  def test_merge_empty_lists(self):
+    """Test merge with empty lists."""
+    self.assertEqual(merge([], []), [])
+
+  def test_merge_basic(self):
+    """Test basic merging with None values in the first list."""
+    list1 = [1, None, 3, None, 5]
+    list2 = [None, 2, None, 4, None]
+    expected = [1, 2, 3, 4, 5]
+    self.assertEqual(merge(list1, list2), expected)
+
+  def test_merge_no_none_in_list1(self):
+    """Test merge when list1 has no None values."""
+    list1 = ['a', 'b', 'c']
+    list2 = [1, 2, 3]
+    expected = ['a', 'b', 'c']  # Should be identical to list1
+    self.assertEqual(merge(list1, list2), expected)
+
+  def test_merge_all_none_in_list1(self):
+    """Test merge when list1 contains only None."""
+    list1 = [None, None, None]
+    list2 = ['x', 'y', 'z']
+    expected = ['x', 'y', 'z']  # Should be identical to list2
+    self.assertEqual(merge(list1, list2), expected)
+
+  def test_merge_mixed_types(self):
+    """Test merge with mixed data types."""
+    list1 = [1, None, "hello", None]
+    list2 = [None, 2.5, None, True]
+    expected = [1, 2.5, "hello", True]
+    self.assertEqual(merge(list1, list2), expected)
+
+  def test_merge_unequal_lengths(self):
+    """Test that merge raises AssertionError for lists of unequal length."""
+    list1 = [1, 2, 3]
+    list2 = [4, 5]
+    # Use assertRaises as a context manager
+    with self.assertRaises(AssertionError) as cm:
+      merge(list1, list2)
+
+    list3 = [6, 7]
+    list4 = [8, 9, 10]
+    with self.assertRaises(AssertionError):
+      merge(list3, list4)  # No need to check message again if already checked
+
+if __name__ == '__main__':
+  unittest.main()  # For running from command line
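The `torchax.util` implementations of `partition` and `merge` are not shown in this commit view; a sketch consistent with the tests above (every element stays at its original index, the "other" list gets `None`, and `merge` takes the non-`None` element and asserts equal lengths) could look like:

def partition(lst, pred):
  """Split lst into (truthy, falsy) lists of the same length as lst,
  keeping each element at its index and placing None in the other list."""
  truthy = [x if pred(x) else None for x in lst]
  falsy = [None if pred(x) else x for x in lst]
  return truthy, falsy


def merge(list1, list2):
  """Inverse of partition: take the element from list1 unless it is None."""
  assert len(list1) == len(list2), "lists must have the same length"
  return [a if a is not None else b for a, b in zip(list1, list2)]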

torchax/torchax/environment.py

Lines changed: 0 additions & 2 deletions
This file was deleted.
