Commit e00149c

pclove1 authored and jax authors committed
Fix unnecessary memory copies between GPU and CPU when jax2tf.call_tf() is used.

- The root cause of the bug is that dtype lookups are incorrect because hashes behave differently between dtype instances and their types. Added comments to `jax.dlpack.SUPPORTED_DTYPES` about this.
- Added unit test coverage.
- Fixing this bug revealed a limitation that causes a "host-to-device" copy in the following two situations (see the details in the unit test comments):
  - When the dtype is 'int32'.
  - When using the PJRT C API runtime.

PiperOrigin-RevId: 610799558
1 parent c42a035 · commit e00149c

File tree

3 files changed: +63 -6 lines changed

jax/_src/dlpack.py
jax/experimental/jax2tf/call_tf.py
jax/experimental/jax2tf/tests/call_tf_test.py

jax/_src/dlpack.py

Lines changed: 7 additions & 1 deletion
@@ -26,6 +26,13 @@
 from jax._src.typing import Array
 
 
+# A set of dtypes that dlpack supports.
+# Note: Make sure to use a "type", not a dtype instance, when looking up this set
+# because their hashes are different.
+# For example,
+# hash(jnp.float32) != hash(jnp.dtype(jnp.float32))
+# hash(jnp.float32) == hash(jnp.dtype(jnp.float32).type)
+# TODO(phawkins): Migrate to using dtypes instead of the scalar type objects.
 SUPPORTED_DTYPES = frozenset({
     jnp.int8, jnp.int16, jnp.int32, jnp.int64, jnp.uint8, jnp.uint16,
     jnp.uint32, jnp.uint64, jnp.float16, jnp.bfloat16, jnp.float32,
@@ -76,7 +83,6 @@ def to_dlpack(x: Array, take_ownership: bool = False,
   ) # type: ignore
 
 
-
 def from_dlpack(external_array):
   """Returns a :class:`~jax.Array` representation of a DLPack tensor.
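
The hash mismatch described in the new comment is easy to reproduce. A minimal standalone sketch (not part of the commit; `SUPPORTED` is a stand-in for `jax.dlpack.SUPPORTED_DTYPES`):

import jax.numpy as jnp

# jnp.float32 is a scalar *type*; jnp.dtype(jnp.float32) is a dtype *instance*.
# The two compare equal with ==, but hash differently, so frozenset membership
# (which is hash-based) misses when the wrong kind of key is used.
assert hash(jnp.float32) != hash(jnp.dtype(jnp.float32))
assert hash(jnp.float32) == hash(jnp.dtype(jnp.float32).type)

SUPPORTED = frozenset({jnp.float32})  # keyed by scalar types, like SUPPORTED_DTYPES

x = jnp.zeros([1], jnp.float32)
print(x.dtype in SUPPORTED)       # False: the dtype instance hashes differently
print(x.dtype.type in SUPPORTED)  # True: .type recovers the scalar type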

jax/experimental/jax2tf/call_tf.py

Lines changed: 8 additions & 4 deletions
@@ -40,6 +40,7 @@
 from jax._src import core
 from jax._src import effects
 from jax._src import util
+from jax._src import xla_bridge
 from jax._src.lib import xla_client
 from jax._src.lib.mlir import ir
 from jax._src.lib.mlir.dialects import func as func_dialect
@@ -332,7 +333,7 @@ def _call_tf_impl(*args_jax_flat, callable_flat_tf, **_):
 def _arg_jax_to_tf(arg_jax):
   if (isinstance(arg_jax, jax.Array) and
       list(arg_jax.devices())[0].platform in _DLPACK_PLATFORMS and
-      arg_jax.dtype in dlpack.SUPPORTED_DTYPES):
+      arg_jax.dtype.type in dlpack.SUPPORTED_DTYPES):
     arg_dlpack = jax.dlpack.to_dlpack(arg_jax, take_ownership=False)
     return tf.experimental.dlpack.from_dlpack(arg_dlpack)
   # The following avoids copies to the host on CPU, always for Array
@@ -349,11 +350,14 @@ def _arg_jax_to_tf(arg_jax):
   res_tf_flat = callable_flat_tf(*args_tf_flat)
 
   def _res_tf_to_jax(res_tf: TfVal):
-    res_tf, _ = jax2tf_internal._tfval_to_tensor_jax_dtype(res_tf)
-    if isinstance(res_tf, tf.Tensor) and res_tf.dtype in dlpack.SUPPORTED_DTYPES:
+    res_tf, jax_dtype = jax2tf_internal._tfval_to_tensor_jax_dtype(res_tf)
+    if isinstance(res_tf, tf.Tensor) and jax_dtype.type in dlpack.SUPPORTED_DTYPES:
       res_tf_platform = tf.DeviceSpec.from_string(res_tf.backing_device).device_type
       res_jax_platform = res_tf_platform.lower()
-      if res_jax_platform in _DLPACK_PLATFORMS:
+      # Skip using dlpack in PJRT C API runtime, because it currently fails
+      # with "PJRT C API does not support GetDefaultLayout".
+      # https://github.com/openxla/xla/blob/762bde36adf22792e91c38fe87cabe5af05bfadc/xla/pjrt/pjrt_c_api_client.h#L285-L289
+      if res_jax_platform in _DLPACK_PLATFORMS and not xla_bridge.using_pjrt_c_api():
         res_dlpack = tf.experimental.dlpack.to_dlpack(res_tf)
         return jax.dlpack.from_dlpack(res_dlpack)
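
For context, a minimal `call_tf` round trip that exercises the fixed path (a sketch, assuming a DLPack-supported dtype; `tf.math.sin` is just an example TF op):

import jax.numpy as jnp
import tensorflow as tf
from jax.experimental import jax2tf

# call_tf lets JAX call a TF function. With the fix, the float32 argument's
# dtype now passes the SUPPORTED_DTYPES check (via .type), so on a GPU the
# buffer is exchanged with TF through DLPack instead of being copied via host.
f = jax2tf.call_tf(lambda x: tf.math.sin(x))
x = jnp.linspace(0.0, 1.0, 8, dtype=jnp.float32)
print(f(x))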

jax/experimental/jax2tf/tests/call_tf_test.py

Lines changed: 48 additions & 1 deletion
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Tests for call_tf."""
+
+import contextlib
 from functools import partial
 import os
 from typing import Callable
@@ -22,14 +24,16 @@
 from absl.testing import parameterized
 import jax
 from jax import config
+from jax import dlpack
 from jax import dtypes
 from jax import lax
 from jax import numpy as jnp
 from jax._src import test_util as jtu
+from jax._src import xla_bridge
 from jax._src.lib.mlir import ir
 from jax._src.lib.mlir.dialects import hlo
-from jax.experimental import jax2tf
 from jax.experimental import export
+from jax.experimental import jax2tf
 from jax.experimental.jax2tf.tests import tf_test_util
 import numpy as np
 
@@ -814,6 +818,49 @@ def f_jax(x):
     res = f_tf(x)
     self.assertAllClose(res, f_jax(x))
 
+  @parameterized.named_parameters(
+      {"testcase_name": f"_type={type_.__name__}", "type_": type_}
+      for type_ in dlpack.SUPPORTED_DTYPES
+  )
+  def test_avoid_copy_between_gpu_and_cpu(self, type_):
+    try:
+      gpu_devices = jax.devices("gpu")
+    except RuntimeError:
+      gpu_devices = []
+    if not gpu_devices:
+      raise unittest.SkipTest("Test requires a GPU device.")
+
+    def tf_fun(x):
+      if type_ == jnp.bool_:
+        return tf.math.logical_or(x, True)
+      else:
+        return x + 1
+
+    jax_array_on_gpu = jnp.zeros([1], type_, device=gpu_devices[0])
+
+    # Since the input array is already on a GPU device, we expect that no memory
+    # copy occurs between GPU and CPU. Thus, we expect no errors raised by the
+    # transfer guard.
+    # There are two exceptions:
+    # First, when the dtype is "int32". This is because almost all TensorFlow
+    # kernels for GPU devices keep int32 tensors in host memory.
+    # (https://github.com/tensorflow/tensorflow/blob/4eb3e36d1b0cd511e1677e740bd093f42365cf9f/tensorflow/python/eager/pywrap_tensor.cc#L352-L354)
+    # Hence, for "int32", we do expect a "host-to-device" copy.
+    # Second, when using the PJRT C API runtime. This is because it currently
+    # skips dlpack to work around the "PJRT C API does not support
+    # GetDefaultLayout" runtime error.
+    # https://github.com/openxla/xla/blob/762bde36adf22792e91c38fe87cabe5af05bfadc/xla/pjrt/pjrt_c_api_client.h#L285-L289
+    @contextlib.contextmanager
+    def _transfer_guard(guard_level):
+      with contextlib.ExitStack() as stack:
+        stack.enter_context(jax.transfer_guard_device_to_device(guard_level))
+        stack.enter_context(jax.transfer_guard_device_to_host(guard_level))
+        if not (type_ == jnp.int32 or xla_bridge.using_pjrt_c_api()):
+          stack.enter_context(jax.transfer_guard_host_to_device(guard_level))
+        yield
+
+    with _transfer_guard("disallow_explicit"):
+      jax2tf.call_tf(tf_fun)(jax_array_on_gpu)
+
 
 class RoundTripToJaxTest(tf_test_util.JaxToTfTestCase):
   "Reloading output of jax2tf into JAX with call_tf"
