
Commit e0cad3d

Ryan Sepassi authored and copybara-github committed
Add in_memory option to load and as_dataset for small datasets
Defaults to loading small datasets (<1GB) in memory. Note that to benefit from this, tfds.load should be called just once and the dataset that's returned should be reused.

PiperOrigin-RevId: 249295830
1 parent 43176bb commit e0cad3d
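
To make the "call tfds.load just once and reuse the result" note above concrete, here is a minimal usage sketch (assumes eager execution; the "mnist" dataset name and epoch count are placeholders, not part of this commit):

import tensorflow_datasets as tfds

NUM_EPOCHS = 5  # placeholder
ds = tfds.load("mnist", split="train", in_memory=True)  # call load() only once
for _ in range(NUM_EPOCHS):
  for example in ds:  # reuse the same dataset object so the in-memory copy
    pass              # is built a single time and shared across epochs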

File tree

5 files changed: +69 -6 lines changed

tensorflow_datasets/core/dataset_builder.py
tensorflow_datasets/core/dataset_builder_test.py
tensorflow_datasets/core/dataset_utils.py
tensorflow_datasets/core/registered.py
tensorflow_datasets/core/registered_test.py


tensorflow_datasets/core/dataset_builder.py (+41 -4)

@@ -275,7 +275,8 @@ def as_dataset(self,
                  split=None,
                  batch_size=1,
                  shuffle_files=None,
-                 as_supervised=False):
+                 as_supervised=False,
+                 in_memory=None):
     """Constructs a `tf.data.Dataset`.
 
     Callers must pass arguments as keyword arguments.
@@ -297,6 +298,12 @@ def as_dataset(self,
         `builder.info.supervised_keys`. If `False`, the default,
         the returned `tf.data.Dataset` will have a dictionary with all the
         features.
+      in_memory: `bool`, if `True`, loads the dataset in memory which
+        increases iteration speeds. Note that if `True` and the dataset has
+        unknown dimensions, the features will be padded to the maximum
+        size across the dataset. By default (when `None`), will load the
+        dataset in memory if the size is <1GB and all feature dimensions are
+        statically known.
 
     Returns:
       `tf.data.Dataset`, or if `split=None`, `dict<key: tfds.Split, value:
@@ -322,12 +329,13 @@ def as_dataset(self,
         shuffle_files=shuffle_files,
         batch_size=batch_size,
         as_supervised=as_supervised,
+        in_memory=in_memory,
     )
     datasets = utils.map_nested(build_single_dataset, split, map_tuple=True)
     return datasets
 
   def _build_single_dataset(self, split, shuffle_files, batch_size,
-                            as_supervised):
+                            as_supervised, in_memory):
     """as_dataset for a single split."""
     if isinstance(split, six.string_types):
       split = splits_lib.Split(split)
@@ -341,10 +349,39 @@ def _build_single_dataset(self, split, shuffle_files, batch_size,
       batch_size = self.info.splits.total_num_examples or sys.maxsize
 
     dataset = self._as_dataset(split=split, shuffle_files=shuffle_files)
+
+    # If the dataset is small, load it in memory
+    # TODO(tfds): Expose and use the actual data size on disk and rm the manual
+    # name guards. size_in_bytes is the download size, which is misleading,
+    # particularly for datasets that use manual_dir as well as some downloads
+    # (wmt and diabetic_retinopathy_detection).
+    dataset_shape_is_fully_defined = (
+        dataset_utils.dataset_shape_is_fully_defined(dataset))
+    in_memory_default = (
+        self.info.size_in_bytes and
+        self.info.size_in_bytes <= 1e9 and
+        not self.name.startswith("wmt") and
+        not self.name.startswith("diabetic") and
+        dataset_shape_is_fully_defined)
+    in_memory = in_memory_default if in_memory is None else in_memory
+    if in_memory and not wants_full_dataset:
+      # TODO(tfds): Enable in_memory without padding features. May be able
+      # to do by using a requested version of tf.data.Dataset.cache that can
+      # persist a cache beyond iterator instances.
+      if not dataset_shape_is_fully_defined:
+        tf.logging.warning("Called in_memory=True on a dataset that does not "
+                           "have fully defined shapes. Note that features with "
+                           "variable length dimensions will be 0-padded to "
+                           "the maximum length across the dataset.")
+      # Use padded_batch so that features with unknown shape are supported.
+      full_bs = self.info.splits.total_num_examples or sys.maxsize
+      dataset = dataset.padded_batch(full_bs, dataset.output_shapes)
+      dataset = tf.data.Dataset.from_tensor_slices(
+          next(dataset_utils.as_numpy(dataset)))
+
     if batch_size > 1:
       # Use padded_batch so that features with unknown shape are supported.
-      padded_shapes = self.info.features.shape
-      dataset = dataset.padded_batch(batch_size, padded_shapes)
+      dataset = dataset.padded_batch(batch_size, dataset.output_shapes)
 
     if as_supervised:
       if not self.info.supervised_keys:
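
For context, the in-memory path added to _build_single_dataset above batches the whole split into one element, pulls it into host memory, and rebuilds a tf.data.Dataset from the resulting arrays. Below is a minimal standalone sketch of the same idea in TF 2.x eager style, not the TF 1.x code from this commit (the load_in_memory helper and toy dataset are illustrative assumptions):

import tensorflow as tf

def load_in_memory(ds, num_examples):
  """Hypothetical helper: materializes a small dataset as in-memory arrays."""
  # One giant batch holds every example (fine when element shapes are static);
  # .numpy() copies the tensors into host memory.
  full_batch = next(iter(ds.batch(num_examples)))
  arrays = tf.nest.map_structure(lambda t: t.numpy(), full_batch)
  # Rebuilding from the arrays means later epochs never touch the source files.
  return tf.data.Dataset.from_tensor_slices(arrays)

# Toy usage:
ds = tf.data.Dataset.from_tensor_slices({"x": tf.range(20), "y": tf.range(20)})
ds_mem = load_in_memory(ds, num_examples=20)
for batch in ds_mem.repeat(3).batch(4):
  pass  # iterates from RAM rather than re-reading the original source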

tensorflow_datasets/core/dataset_builder_test.py (+7 -0)

@@ -414,6 +414,13 @@ def tearDownClass(cls):
   def setUp(self):
     self.builder = DummyDatasetSharedGenerator(data_dir=self._tfds_tmp_dir)
 
+  @testing.run_in_graph_and_eager_modes()
+  def test_in_memory(self):
+    train_data = dataset_utils.as_numpy(
+        self.builder.as_dataset(split="train", in_memory=True))
+    train_data = [el for el in train_data]
+    self.assertEqual(20, len(train_data))
+
   @testing.run_in_graph_and_eager_modes()
   def test_all_splits(self):
     splits = dataset_utils.as_numpy(

tensorflow_datasets/core/dataset_utils.py (+12 -2)

@@ -163,8 +163,13 @@ def _eager_dataset_iterator(dataset):
     yield tf.nest.pack_sequence_as(item, flat)
 
 
-def _graph_dataset_iterator(ds_item, graph=None):
+def _graph_dataset_iterator(ds_iter, graph=None):
+  """Constructs a Python generator from a tf.data.Iterator."""
+  with utils.maybe_with_graph(graph, create_if_none=False):
+    init = ds_iter.initializer
+    ds_item = ds_iter.get_next()
   with utils.nogpu_session(graph) as sess:
+    sess.run(init)
     while True:
       try:
         yield sess.run(ds_item)
@@ -219,7 +224,7 @@ def as_numpy(dataset, graph=None):
   # First create iterators for datasets
   with utils.maybe_with_graph(graph, create_if_none=False):
     ds_iters = [
-        tf.compat.v1.data.make_one_shot_iterator(ds_el).get_next()
+        tf.compat.v1.data.make_initializable_iterator(ds_el)
         for ds_el in flat_ds if tf_compat.is_dataset(ds_el)
     ]
   ds_iters = [_graph_dataset_iterator(ds_iter, graph) for ds_iter in ds_iters]
@@ -240,3 +245,8 @@ def as_numpy(dataset, graph=None):
 
   # Nest
   return tf.nest.pack_sequence_as(nested_ds, flat_np)
+
+
+def dataset_shape_is_fully_defined(ds):
+  return all(
+      [ts.is_fully_defined() for ts in tf.nest.flatten(ds.output_shapes)])
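
To show what the new dataset_shape_is_fully_defined helper checks, here is a small sketch using the compat shape accessor rather than the output_shapes property used above (assumes TF 2.x eager; the dataset constructions are illustrative only):

import tensorflow as tf

def shapes_fully_defined(ds):
  # Same check as the helper above: every feature's static shape must be known.
  shapes = tf.compat.v1.data.get_output_shapes(ds)
  return all(ts.is_fully_defined() for ts in tf.nest.flatten(shapes))

fixed = tf.data.Dataset.from_tensor_slices(tf.zeros([4, 3]))   # elements: (3,)
ragged = fixed.map(lambda x: tf.boolean_mask(x, x > 0))        # elements: (None,)
assert shapes_fully_defined(fixed)
assert not shapes_fully_defined(ragged)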

tensorflow_datasets/core/registered.py (+8 -0)

@@ -177,6 +177,7 @@ def load(name,
          split=None,
          data_dir=None,
          batch_size=1,
+         in_memory=None,
          download=True,
          as_supervised=False,
          with_info=False,
@@ -231,6 +232,12 @@ def load(name,
     batch_size: `int`, set to > 1 to get batches of examples. Note that
       variable length features will be 0-padded. If
       `batch_size=-1`, will return the full dataset as `tf.Tensor`s.
+    in_memory: `bool`, if `True`, loads the dataset in memory which
+      increases iteration speeds. Note that if `True` and the dataset has
+      unknown dimensions, the features will be padded to the maximum
+      size across the dataset. By default (when `None`), will load the
+      dataset in memory if the size is <1GB and all feature dimensions are
+      statically known.
     download: `bool` (optional), whether to call
       `tfds.core.DatasetBuilder.download_and_prepare`
       before calling `tf.DatasetBuilder.as_dataset`. If `False`, data is
@@ -290,6 +297,7 @@ def load(name,
   as_dataset_kwargs["split"] = split
   as_dataset_kwargs["as_supervised"] = as_supervised
   as_dataset_kwargs["batch_size"] = batch_size
+  as_dataset_kwargs["in_memory"] = in_memory
 
   ds = dbuilder.as_dataset(**as_dataset_kwargs)
   if with_info:
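
Since load only forwards in_memory through as_dataset_kwargs, the equivalent builder-level call is the sketch below (hedged; "mnist" is a stand-in dataset name, not something this commit touches):

import tensorflow_datasets as tfds

builder = tfds.builder("mnist")
builder.download_and_prepare()
# load() simply forwards in_memory via as_dataset_kwargs to this call:
ds = builder.as_dataset(split="train", in_memory=True)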

tensorflow_datasets/core/registered_test.py (+1 -0)

@@ -122,6 +122,7 @@ def test_load(self):
                      builder.as_dataset_kwargs.pop("split"))
     self.assertEqual(1, builder.as_dataset_kwargs.pop("batch_size"))
     self.assertFalse(builder.as_dataset_kwargs.pop("as_supervised"))
+    self.assertIsNone(builder.as_dataset_kwargs.pop("in_memory"))
     self.assertEqual(builder.as_dataset_kwargs, as_dataset_kwargs)
     self.assertEqual(dict(data_dir=data_dir, k1=1), builder.kwargs)
 