Commit a3ef52e

Cache HLO in xb.call_jax and support non-tensor args (#8878)
1 parent 2aa11cc commit a3ef52e

3 files changed: +254 -27 lines changed
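Before the diffs, a minimal usage sketch of what this commit enables, adapted from the new test_call_jax_non_tensor_args test below; the function f and its argument values are illustrative, not part of the commit.

import torch
import torch_xla
import torch_xla.core.xla_builder as xb

def f(a, num: float, string: str, dictionary: dict, none):
  # Tensor arguments arrive as JAX arrays; non-tensor arguments arrive as plain Python values.
  import jax.numpy as jnp
  return a + jnp.sin(num) + int(string) + dictionary['x']

a = torch.ones((3, 3), device='xla')
b = xb.call_jax(
    f, (a, 1.0, "10", {"x": torch.tensor(0.25, device='xla')}),
    kwargs={"none": None})
torch_xla.sync()  # repeated calls with the same shapes and static values reuse the cached HLO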

test/test_jax_interop.py

Lines changed: 111 additions & 0 deletions
@@ -103,6 +103,117 @@ def f_jax(a, b):
     # backward should produce same gradient
     torch.testing.assert_close(out_grad_torch, out_grad_jax)
 
+  def test_call_jax_non_tensor_args(self):
+    """Test that call_jax works with non-tensor arguments."""
+
+    dev = xm.xla_device()
+    a = torch.ones((3, 3), device=dev)
+
+    def f(a, num: float, string: str, dictionary: dict, none):
+      assert isinstance(string, str)
+      import jax.numpy as jnp
+      if none is None:
+        return a + jnp.sin(num) + int(string) + dictionary['x']
+      raise ValueError('none should be None')
+
+    b = xb.call_jax(
+        f, (
+            a,
+            1.0,
+            "10",
+            {
+                "x": torch.tensor(0.25, device=dev)
+            },
+        ),
+        kwargs={"none": None})
+    torch_xla.sync()
+    torch.testing.assert_close(
+        b, torch.sin(torch.ones(3, 3)) + 1 + 10 + 0.25, check_device=False)
+
+  def test_call_jax_cache_hlo(self):
+    """Test that the HLO of a jax function should be cached."""
+
+    starting_cache_misses = xb._jax_to_hlo_cache_num_misses()
+
+    # Let's trace two different jax functions a couple of times.
+    dev = xm.xla_device()
+    a = torch.ones((3, 3), device=dev)
+
+    def f(a, b):
+      import jax.numpy as jnp
+      return a + jnp.sin(b)
+
+    def g(a, b):
+      import jax.numpy as jnp
+      return a + jnp.cos(b)
+
+    xb.call_jax(f, (a, a))
+    xb.call_jax(f, (a, a))
+    xb.call_jax(g, (a, a))
+    xb.call_jax(g, (a, a))
+
+    ending_cache_misses = xb._jax_to_hlo_cache_num_misses()
+    self.assertEqual(ending_cache_misses - starting_cache_misses, 2)
+
+  def test_call_jax_cache_by_shape(self):
+    """Test that the same function may be traced again if the shape of its arguments changes."""
+
+    starting_cache_misses = xb._jax_to_hlo_cache_num_misses()
+
+    # Let's trace the same jax function with different shapes.
+    dev = xm.xla_device()
+    a = torch.ones((3, 3), device=dev)
+    b = torch.ones((2, 2), device=dev)
+
+    def f(a, b):
+      import jax.numpy as jnp
+      return a + jnp.sin(b)
+
+    xb.call_jax(f, (a, a))
+    xb.call_jax(f, (b, b))
+
+    ending_cache_misses = xb._jax_to_hlo_cache_num_misses()
+    self.assertEqual(ending_cache_misses - starting_cache_misses, 2)
+
+  def test_call_jax_cache_by_tree_spec(self):
+    """Test that the same function may be traced again if the tree spec of its arguments changes."""
+    starting_cache_misses = xb._jax_to_hlo_cache_num_misses()
+
+    # Let's trace the same jax function with different tree specs.
+    dev = xm.xla_device()
+    a = torch.ones((3, 3), device=dev)
+    b = torch.ones((3, 2), device=dev)
+
+    def f(inputs):
+      a = inputs['a']
+      b = inputs['b']
+      return a @ b
+
+    xb.call_jax(f, ({'a': a, 'b': a},))
+    xb.call_jax(f, ({'a': a, 'b': b},))
+
+    ending_cache_misses = xb._jax_to_hlo_cache_num_misses()
+    self.assertEqual(ending_cache_misses - starting_cache_misses, 2)
+
+  def test_call_jax_cache_by_static_args(self):
+    """Test that the same function may be traced again if a non-tensor argument changes."""
+    starting_cache_misses = xb._jax_to_hlo_cache_num_misses()
+
+    # Let's trace the same jax function with different static args.
+    dev = xm.xla_device()
+    a = torch.ones((3, 3), device=dev)
+
+    def f(a, num: float):
+      import jax.numpy as jnp
+      return a + jnp.sin(num)
+
+    xb.call_jax(f, (a, 1.0))
+    xb.call_jax(f, (a, 2.0))
+    xb.call_jax(f, (a, 3.0))
+
+    ending_cache_misses = xb._jax_to_hlo_cache_num_misses()
+    self.assertEqual(ending_cache_misses - starting_cache_misses, 3)
+
 
 if __name__ == "__main__":
   absltest.main()
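A practical reading of test_call_jax_cache_by_static_args above: non-tensor arguments are treated as static and become part of the cache key, so every distinct value forces a retrace. The sketch below is illustrative and not part of the commit; it relies on the abstraction rules added in xla_builder.py further down (tensors are keyed only by shape and dtype), and scale_and_sin is a hypothetical name.

import torch
import torch_xla.core.xla_builder as xb

def scale_and_sin(a, scale):
  import jax.numpy as jnp
  return a * jnp.sin(scale)

a = torch.ones((3, 3), device='xla')

# A Python float is a static arg: three distinct values should mean three traces.
for step in range(3):
  xb.call_jax(scale_and_sin, (a, float(step)))

# Wrapping the value in a 0-d XLA tensor keeps the shape/dtype cache key stable,
# so only the first call should trace; later values reuse the cached HLO.
for step in range(3):
  xb.call_jax(scale_and_sin, (a, torch.tensor(float(step), device='xla')))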

torch_xla/core/xla_builder.py

Lines changed: 142 additions & 27 deletions
@@ -1,7 +1,10 @@
+from copy import copy
+from typing import Any, Optional
+from weakref import WeakKeyDictionary
 import torch
 import torch_xla
 from torch.utils._pytree import tree_flatten, tree_unflatten
-from torch_xla.experimental.custom_kernel import jax_import_guard
+from torch_xla.experimental.custom_kernel import _jax_env_context, jax_import_guard
 
 
 class Type:
@@ -827,21 +830,23 @@ def get_computation_hlo(computation):
 
 class XlaComputation:
 
-  def __init__(self, name, hlo_module, flattened_inputs):
+  def __init__(self, name, hlo_module, flattened_inputs, pick_tensor_args):
     self.num_inputs = len(flattened_inputs)
     builder = create_builder(name)
     computation = computation_from_module_proto(name, hlo_module)
     params = []
     for idx, val in enumerate(flattened_inputs):
       params.append(mkparam(builder, idx, tensor_shape(val)))
     call_op = Op.call(computation, params)
-    call_computation = call_op.build('call_jax')
+    call_computation = call_op.build(f'call_jax_{name}')
     self.call_computation = call_computation
     self.name = name
+    self.pick_tensor_args = pick_tensor_args
 
   def __call__(self, input_list):
+    input_tensors = self.pick_tensor_args(input_list)
     result = torch_xla._XLAC._xla_user_computation(f'xla::call_jax_{self.name}',
-                                                   input_list,
+                                                   input_tensors,
                                                    self.call_computation)
     if isinstance(result, list) and len(result) == 1:
       return result[0]
@@ -855,32 +860,142 @@ def jax_func_to_xla_computation(jax_func, args, kwargs, name=None):
   # If we don't do this before calling jax, any torch_xla operation will hang.
   jax_import_guard()
 
-  import jax
-  import torchax.ops.mappings as mappings
-
-  flattened, spec = tree_flatten((args, kwargs))
-
-  def fn_flattened_inputs(*flattened):
-    args, kwargs = tree_unflatten(flattened, spec)
-    return jax_func(*args, **kwargs)
-
-  sample_input_shapes = tuple(
-      jax.ShapeDtypeStruct(a.shape, mappings.t2j_dtype(a.dtype))
-      for a in flattened)
-  # `as_serialized_hlo_module_proto` is mentioned at
-  # https://github.com/jax-ml/jax/discussions/22266
-  hlo_module = jax.jit(
-      fn_flattened_inputs,
-      keep_unused=True).lower(*sample_input_shapes).compiler_ir(
-          'hlo').as_serialized_hlo_module_proto()  # type: ignore
-
-  return XlaComputation(name, hlo_module, flattened)
-
-
-def call_jax(jax_func, args, kwargs=None, name=None):
+  # Prevent JAX from discovering MegaScale devices a second time. If we don't do this,
+  # then the MegaScale device discovery will hang.
+  with _jax_env_context():
+    import jax
+    import torchax.ops.mappings as mappings
+
+    flattened_inputs, spec = jax.tree.flatten((args, kwargs))
+
+    def abstractify(a):  # make a pytree leaf abstract
+      import jax
+      import torch_xla
+      if a is None:
+        return None
+      if isinstance(a, torch.Tensor):
+        assert a.device == torch_xla.device(
+        ), f"Inputs must be XLA tensors. Got {a.device}"
+        return jax.ShapeDtypeStruct(a.shape, mappings.t2j_dtype(a.dtype))
+      return a
+
+    sample_inputs = tuple(abstractify(a) for a in flattened_inputs)
+
+    # Pick out the non-static args.
+    # Consider anything that is not a `jax.ShapeDtypeStruct` as a static arg.
+    def pick_tensor_args(flattened_args):
+      tensor_args = []
+      for i in range(len(sample_inputs)):
+        if isinstance(sample_inputs[i], jax.ShapeDtypeStruct):
+          tensor_args.append(flattened_args[i])
+      return tensor_args
+
+    sample_tensor_args = pick_tensor_args(sample_inputs)
+    tensor_args = pick_tensor_args(flattened_inputs)
+
+    # This function only takes in tensor arguments because its signature must
+    # match the signature of the HLO module lowered from JAX, allowing us to
+    # wrap it in an XLA user computation.
+    def fn(*tensor_args):
+      # Go from a list of tensor args to the full list of flattened arguments,
+      # by referencing the original flattened inputs.
+      new_flattened = copy(flattened_inputs)
+      tensor_args_iter = iter(tensor_args)
+      for i in range(len(sample_inputs)):
+        if isinstance(sample_inputs[i], jax.ShapeDtypeStruct):
+          new_flattened[i] = next(tensor_args_iter)
+      args, kwargs = jax.tree.unflatten(spec, new_flattened)
+      return jax_func(*args, **kwargs)
+
+    def get_hlo():
+      import torch_xla.debug.profiler as xp
+      # If we see this trace span in the profiler, we'll know that there's a cache miss.
+      with xp.Trace('jax_to_hlo'):
+        hlo_ir = jax.jit(
+            fn, keep_unused=True).lower(*sample_tensor_args).compiler_ir('hlo')
+
+        # Get a protobuf representation of the HLO. `as_serialized_hlo_module_proto` is
+        # mentioned at https://github.com/jax-ml/jax/discussions/22266
+        return hlo_ir.as_serialized_hlo_module_proto()  # type: ignore
+
+    hlo_module = _jax_to_hlo_cache_get_or_insert(jax_func, sample_inputs, spec,
+                                                 get_hlo)
+    return XlaComputation(name, hlo_module, tensor_args, pick_tensor_args)
+
+
+def _jax_to_hlo_cache_get_or_insert(jax_func, sample_inputs: tuple[Any, ...],
+                                    input_tree_spec, get_hlo):
+  global _JAX_TO_HLO_CACHE
+  # Use two layers of dictionary lookup.
+  # The first layer uses the `jax_func`, which is only weakly referenced.
+  # The second layer uses the sample inputs and the tree spec, which is strongly referenced.
+  inner_dict = _JAX_TO_HLO_CACHE.get(jax_func, None)
+  if inner_dict is not None:
+    hlo = inner_dict.get((sample_inputs, input_tree_spec), None)
+    if hlo is not None:
+      return hlo
+
+  # Compile the jax function to HLO.
+  hlo = get_hlo()
+  if inner_dict is None:
+    _JAX_TO_HLO_CACHE[jax_func] = {}
+  _JAX_TO_HLO_CACHE[jax_func][(sample_inputs, input_tree_spec)] = hlo
+  return hlo
+
+
+def _jax_to_hlo_cache_num_misses() -> int:
+  size = 0
+  for inner_dict in _JAX_TO_HLO_CACHE.values():
+    size += len(inner_dict)
+  return size
+
+
+_JAX_TO_HLO_CACHE = WeakKeyDictionary()
+
+
+def call_jax(jax_func,
+             args: tuple[Any, ...],
+             kwargs: Optional[dict[str, Any]] = None,
+             name=None):
   """
   Call a JAX function `jax_func` with the given `args` and `kwargs` that may contain
   XLA tensors.
+
+  Args:
+    jax_func: a functionally pure Python callable that does some math on JAX arrays.
+      It needs to be `jax.jit` traceable.
+
+    args: a tuple of arguments to pass to `jax_func`. Any XLA tensors are turned into
+      JAX arrays before being passed to `jax_func`.
+
+    kwargs: a dictionary of keyword arguments to pass to `jax_func`. Any XLA tensors are
+      turned into JAX arrays before being passed to `jax_func`.
+
+  ## Example
+
+  >>> import torch
+  >>> import torch_xla
+  >>> import torch_xla.core.xla_builder as xb
+  >>>
+  >>> def f(a, b):
+  >>>   # Call any JAX functionality here.
+  >>>   import jax.numpy as jnp
+  >>>   return a + jnp.sin(b)
+  >>>
+  >>> # Pass PyTorch/XLA tensors to the JAX function this way.
+  >>> a = torch.ones((3, 3), device='xla')
+  >>> b = xb.call_jax(f, (a, a))
+  >>>
+  >>> # Result is the same as if we ran the equivalent torch ops.
+  >>> torch.testing.assert_close(b.cpu(), torch.sin(torch.ones(3, 3)) + 1)
+
+  ## Caching
+
+  In order to call `jax_func`, we will jit compile it into HLO, which involves tracing
+  the function. The address of `jax_func` and the shapes of `args` and `kwargs` are used
+  as the key into a cache to avoid repeated tracing/compilation, similar to how `jax.jit`
+  works. If you get tracing overhead, check if `jax_func` is being redefined all the time.
+  A common mistake is defining `jax_func` as a local function, e.g. during a training step.
   """
 
   kwargs = kwargs or {}
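The "## Caching" note added to the call_jax docstring above warns against redefining jax_func on every call, since the HLO cache is keyed weakly on the function object plus the argument shapes and tree spec. A hedged sketch of that pitfall and its fix; jax_add_sin and the train_step_* names are illustrative, not part of the commit.

import torch
import torch_xla.core.xla_builder as xb

def jax_add_sin(x, y):
  # Defined once at module scope: calls after the first hit the cached HLO.
  import jax.numpy as jnp
  return x + jnp.sin(y)

def train_step_slow(a):
  def local_fn(x, y):  # a fresh function object every step...
    import jax.numpy as jnp
    return x + jnp.sin(y)
  # ...so this should re-trace and re-lower to HLO on every call.
  return xb.call_jax(local_fn, (a, a))

def train_step_fast(a):
  return xb.call_jax(jax_add_sin, (a, a))  # reuses the cached HLO

a = torch.ones((3, 3), device='xla')
for _ in range(3):
  train_step_fast(a)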

torch_xla/experimental/custom_kernel.py

Lines changed: 1 addition & 0 deletions
@@ -160,6 +160,7 @@ def _extract_backend_config(
 
 @contextmanager
 def _jax_env_context():
+  # TODO(b/374631442): Get rid of this hack.
   try:
     previous_skip_megascale_env = os.environ.get('SKIP_MEGASCALE_PJRT_CLIENT',
                                                  None)
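The hunk above shows only the first lines of _jax_env_context. For readers without the full file, a minimal sketch of the save/set/restore pattern it appears to implement; the body below (in particular the value written and the restore logic) is an assumption, not the verbatim implementation, and the function is renamed to make that clear.

import os
from contextlib import contextmanager

@contextmanager
def _jax_env_context_sketch():
  # TODO(b/374631442): Get rid of this hack.
  previous_skip_megascale_env = os.environ.get('SKIP_MEGASCALE_PJRT_CLIENT', None)
  try:
    # Assumed: tell JAX to skip MegaScale device discovery while tracing/lowering.
    os.environ['SKIP_MEGASCALE_PJRT_CLIENT'] = 'true'
    yield
  finally:
    # Restore whatever the caller had set before entering the context.
    if previous_skip_megascale_env is None:
      os.environ.pop('SKIP_MEGASCALE_PJRT_CLIENT', None)
    else:
      os.environ['SKIP_MEGASCALE_PJRT_CLIENT'] = previous_skip_megascale_env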
