
Commit 5ce7dca

yashk2810 authored and jax authors committed
Add support for loading checkpoints with a given layout to the array serialization library
PiperOrigin-RevId: 624596358
1 parent 70dca30 commit 5ce7dca


2 files changed: +56 -4 lines changed


jax/experimental/array_serialization/serialization.py

Lines changed: 14 additions & 4 deletions

@@ -33,6 +33,7 @@
 from jax._src import distributed
 from jax._src import sharding
 from jax._src import sharding_impls
+from jax._src.layout import Layout, DeviceLocalLayout as DLL
 from jax._src import typing
 from jax._src import util
 from jax._src.lib import xla_extension as xe
@@ -306,14 +307,22 @@ def estimate_read_memory_footprint(t: ts.TensorStore,


 async def async_deserialize(
-    in_sharding: sharding_impls.XLACompatibleSharding,
+    in_sharding: sharding_impls.XLACompatibleSharding | Layout,
     tensorstore_spec: ts.Spec | dict[str, Any],
     global_shape: Sequence[int] | None = None,
     dtype=None,
     byte_limiter: _LimitInFlightBytes | None = None,
     context=TS_CONTEXT,
     assume_metadata: bool = False,
 ):
+  in_sharding = (in_sharding.sharding if isinstance(in_sharding, Layout) else  # type: ignore
+                 in_sharding)
+  if not isinstance(in_sharding, sharding_impls.XLACompatibleSharding):
+    raise ValueError(
+        'sharding passed to deserialization should be specified, concrete and'
+        f' an instance of `jax.XLACompatibleSharding`. Got {in_sharding}')
+  dll = (in_sharding.device_local_layout if isinstance(in_sharding, Layout)
+         else None)
   t = await ts.open(
       tensorstore_spec,
       open=True,
@@ -340,7 +349,8 @@ async def cb(index: array.Index, device: jax.Device):
       # Cast while reloading on process to avoid 2 copies on device if the
       # casting is done on device.
       out = out.astype(dtype)
-    result = jax.device_put(out, device)
+    result = jax.device_put(
+        out, Layout(dll, jax.sharding.SingleDeviceSharding(device)))
     if byte_limiter is not None:
       # NB: `out` actually might not be ready for garbage collection by the
       # time we call release_bytes. Thus peak memory usage still might grow
@@ -358,7 +368,7 @@ async def cb(index: array.Index, device: jax.Device):
   return await create_async_array_from_callback(tuple(shape), in_sharding, cb)


-def run_deserialization(shardings: Sequence[sharding.Sharding],
+def run_deserialization(shardings: Sequence[sharding.Sharding | Layout],
                         tensorstore_specs: Sequence[dict[str, Any]],
                         global_shapes: Sequence[array.Shape] | None = None,
                         dtypes: Sequence[typing.DTypeLike] | None = None,
@@ -596,7 +606,7 @@ def serialize_with_paths(self, arrays: Sequence[jax.Array],
     tspecs = jax.tree.map(get_tensorstore_spec, paths)
     self.serialize(arrays, tspecs, on_commit_callback=on_commit_callback)

-  def deserialize(self, shardings: Sequence[sharding.Sharding],
+  def deserialize(self, shardings: Sequence[sharding.Sharding | Layout],
                   tensorstore_specs: Sequence[dict[str, Any]],
                   global_shapes: Sequence[array.Shape] | None = None,
                   dtypes: Sequence[typing.DTypeLike] | None = None,
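For orientation, a condensed usage sketch of the new code path (this is not part of the diff): the compiler-chosen Layout returned by output_layouts() is passed straight to run_deserialization in place of a bare sharding, and each shard is then placed with that device-local layout. The mesh shape, the 8-device TPU assumption, and the checkpoint path below are illustrative only.

# Sketch, assuming a TPU backend with 8 devices and an already written
# checkpoint at the hypothetical path below.
import jax
import numpy as np
from jax.sharding import Mesh, NamedSharding, PartitionSpec as P
from jax.experimental.layout import Layout, DeviceLocalLayout as DLL
from jax.experimental.array_serialization import serialization

mesh = Mesh(np.asarray(jax.devices()).reshape(4, 2), ('x', 'y'))
arr = jax.device_put(np.arange(32).reshape(8, 4),
                     NamedSharding(mesh, P('x', 'y')))

# Let the compiler pick a layout for a transposed result; output_layouts()
# returns a Layout bundling the device-local layout with the sharding.
out_layout = jax.jit(lambda x: x.T, out_shardings=Layout(DLL.AUTO)).lower(
    arr).compile().output_layouts()

# run_deserialization now accepts Layout entries in place of bare shardings.
tspecs = [serialization.get_tensorstore_spec('/tmp/ckpt/first')]  # hypothetical path
out, = serialization.run_deserialization([out_layout], tspecs)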

jax/experimental/array_serialization/serialization_test.py

Lines changed: 42 additions & 0 deletions

@@ -16,6 +16,7 @@
 import asyncio
 import math
 from functools import partial
+import re
 import os
 import pathlib
 import tracemalloc as tm
@@ -28,6 +29,7 @@
 from jax.sharding import NamedSharding, GSPMDSharding
 from jax.sharding import PartitionSpec as P
 from jax.experimental.array_serialization import serialization
+from jax.experimental.layout import Layout, DeviceLocalLayout as DLL
 import numpy as np
 import tensorstore as ts

@@ -45,6 +47,13 @@ def tearDownModule():
   prev_xla_flags()


+pattern = re.compile(r"\{(.*?):")
+
+def extract_minor_to_major(l):
+  match = re.search(pattern, str(l))
+  return tuple(int(i) for i in match.groups()[0].split(','))
+
+
 class CheckpointTest(jtu.JaxTestCase):

   def _on_commit_callback(self, temp_ckpt_dir, final_ckpt_dir):
@@ -411,5 +420,38 @@ def test_maybe_cloud_storage(self):
     }
     self.assertTrue(serialization.is_remote_storage(nested_tspec))

+  def test_load_with_layout(self):
+    if not jtu.test_device_matches(['tpu']):
+      self.skipTest('Layouts are only supported on TPUs')
+
+    mesh = jtu.create_global_mesh((4, 2), ('x', 'y'))
+    np_inp = np.arange(32).reshape(8, 4)
+    s = NamedSharding(mesh, P('x', 'y'))
+    arr = jax.device_put(np_inp, s)
+
+    out_layout = jax.jit(lambda x: x.T, out_shardings=Layout(DLL.AUTO)).lower(
+        arr).compile().output_layouts()
+    self.assertEqual(extract_minor_to_major(arr.layout),
+                     extract_minor_to_major(out_layout)[::-1])
+
+    ckpt_dir = pathlib.Path(self.create_tempdir('ckpt').full_path)
+    ckpt_path = pathlib.Path(self.create_tempdir(f'{ckpt_dir}/first').full_path)
+    tspecs = jax.tree_util.tree_map(serialization.get_tensorstore_spec, [ckpt_path])
+
+    manager = serialization.GlobalAsyncCheckpointManager()
+    manager.serialize(
+        [arr], tspecs,
+        on_commit_callback=partial(self._on_commit_callback, ckpt_dir, ckpt_dir))
+    manager.wait_until_finished()
+
+    out, = serialization.run_deserialization([out_layout], tspecs)
+
+    self.assertEqual(out.layout, out_layout)
+    self.assertIsInstance(out, array.ArrayImpl)
+    self.assertArraysEqual(out, np_inp)
+    for s in out.addressable_shards:
+      self.assertArraysEqual(s.data, np_inp[s.index])
+
+
 if __name__ == '__main__':
   absltest.main(testLoader=jtu.JaxTestLoader())
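The diff also widens the manager-level deserialize signature to accept Layout entries. Below is a condensed round trip mirroring what the new test exercises, but through GlobalAsyncCheckpointManager; it is a sketch, not part of the commit. The checkpoint directory, the no-op commit callback, and the 8-device TPU mesh are placeholder assumptions.

# Sketch: serialize, then read back with a compiler-chosen (transposed) layout.
import pathlib
import jax
import numpy as np
from jax.sharding import Mesh, NamedSharding, PartitionSpec as P
from jax.experimental.layout import Layout, DeviceLocalLayout as DLL
from jax.experimental.array_serialization import serialization

mesh = Mesh(np.asarray(jax.devices()).reshape(4, 2), ('x', 'y'))
arr = jax.device_put(np.arange(32).reshape(8, 4),
                     NamedSharding(mesh, P('x', 'y')))

ckpt_path = pathlib.Path('/tmp/ckpt/first')  # placeholder location
tspecs = [serialization.get_tensorstore_spec(str(ckpt_path))]

manager = serialization.GlobalAsyncCheckpointManager()
manager.serialize([arr], tspecs, on_commit_callback=lambda: None)
manager.wait_until_finished()

# Request a non-default (here: transposed) layout when reading back; the
# manager forwards the Layout entries to run_deserialization.
out_layout = jax.jit(lambda x: x.T, out_shardings=Layout(DLL.AUTO)).lower(
    arr).compile().output_layouts()
out, = manager.deserialize([out_layout], tspecs)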
