Commit 80ba498

Merge pull request #651 from ChanchalKumarMaji:improve-usability-1
PiperOrigin-RevId: 252517354
2 parents: 288f00c + 4badde4

File tree

4 files changed (+29, -10 lines)


tensorflow_datasets/core/dataset_builder.py

Lines changed: 5 additions & 6 deletions

@@ -308,7 +308,7 @@ def download_and_prepare(self, download_dir=None, download_config=None):
   @api_utils.disallow_positional_args
   def as_dataset(self,
                  split=None,
-                 batch_size=1,
+                 batch_size=None,
                  shuffle_files=None,
                  as_supervised=False):
     """Constructs a `tf.data.Dataset`.
@@ -320,8 +320,8 @@ def as_dataset(self,
         (default), returns all splits in a dict
         `<key: tfds.Split, value: tf.data.Dataset>`.
       batch_size: `int`, batch size. Note that variable-length features will
-        be 0-padded if `batch_size > 1`. Users that want more custom behavior
-        should use `batch_size=1` and use the `tf.data` API to construct a
+        be 0-padded if `batch_size` is set. Users that want more custom behavior
+        should use `batch_size=None` and use the `tf.data` API to construct a
         custom pipeline. If `batch_size == -1`, will return feature
         dictionaries of the whole dataset with `tf.Tensor`s instead of a
         `tf.data.Dataset`.
@@ -376,10 +376,9 @@ def _build_single_dataset(self, split, shuffle_files, batch_size,
       batch_size = self.info.splits.total_num_examples or sys.maxsize

     dataset = self._as_dataset(split=split, shuffle_files=shuffle_files)
-    if batch_size > 1:
+    if batch_size:
       # Use padded_batch so that features with unknown shape are supported.
-      padded_shapes = self.info.features.shape
-      dataset = dataset.padded_batch(batch_size, padded_shapes)
+      dataset = dataset.padded_batch(batch_size, dataset.output_shapes)

     if as_supervised:
       if not self.info.supervised_keys:
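
In practice, the new default means `as_dataset` yields individual examples unless the caller opts into batching. A minimal sketch of the behavior after this change (using `mnist` purely as an illustrative dataset name, and assuming TFDS is installed and the data has been prepared):

import tensorflow_datasets as tfds

builder = tfds.builder("mnist")  # "mnist" is only an illustrative choice
builder.download_and_prepare()

# New default, batch_size=None: elements are single examples, e.g.
# {"image": (28, 28, 1), "label": ()} -- no leading batch dimension.
ds = builder.as_dataset(split=tfds.Split.TRAIN)

# Explicit batch_size: examples go through padded_batch, so each element
# gains a leading batch dimension: {"image": (32, 28, 28, 1), "label": (32,)}.
ds_batched = builder.as_dataset(split=tfds.Split.TRAIN, batch_size=32)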

tensorflow_datasets/core/dataset_builder_test.py

Lines changed: 11 additions & 1 deletion

@@ -228,7 +228,7 @@ def test_with_configs(self):
     splits_list = [splits_lib.Split.TRAIN, splits_lib.Split.TEST]
     for builder, incr in [(builder1, 1), (builder2, 2)]:
       train_data, test_data = [  # pylint: disable=g-complex-comprehension
-          [el["x"] for el in
+          [el["x"] for el in  # pylint: disable=g-complex-comprehension
           dataset_utils.as_numpy(builder.as_dataset(split=split))]
           for split in splits_list
       ]
@@ -424,6 +424,16 @@ def test_with_batch_size(self):
     self.assertEqual(10, x3.shape[0])
     self.assertEqual(sum(range(30)), int(x1.sum() + x2.sum() + x3.sum()))

+    # By default batch_size is None and won't add a batch dimension
+    ds = self.builder.as_dataset(split=splits_lib.Split.TRAIN)
+    self.assertEqual(0, len(ds.output_shapes["x"]))
+    # Setting batch_size=1 will add an extra batch dimension
+    ds = self.builder.as_dataset(split=splits_lib.Split.TRAIN, batch_size=1)
+    self.assertEqual(1, len(ds.output_shapes["x"]))
+    # Setting batch_size=2 will add an extra batch dimension
+    ds = self.builder.as_dataset(split=splits_lib.Split.TRAIN, batch_size=2)
+    self.assertEqual(1, len(ds.output_shapes["x"]))
+
   @testing.run_in_graph_and_eager_modes()
   def test_supervised_keys(self):
     x, _ = dataset_utils.as_numpy(self.builder.as_dataset(
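
The `padded_batch` call the builder now relies on is why features of unknown (variable) length can still be batched: each batch is 0-padded to the longest element it contains. A standalone `tf.data` sketch of that mechanism (TF 1.x-era `output_types`/`output_shapes` API, matching the `output_shapes` usage in this commit; the toy sequences are made up for illustration):

import tensorflow as tf

# Toy variable-length feature: three sequences of different lengths.
ds = tf.data.Dataset.from_generator(
    lambda: iter([[1], [2, 3], [4, 5, 6]]),
    output_types=tf.int32,
    output_shapes=tf.TensorShape([None]))

# padded_batch 0-pads every element in a batch to that batch's longest
# sequence, which is how features with unknown shape stay batchable.
batched = ds.padded_batch(batch_size=3, padded_shapes=[None])
for batch in batched:  # eager mode
  print(batch.numpy())
# -> [[1 0 0]
#     [2 3 0]
#     [4 5 6]]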

tensorflow_datasets/core/registered.py

Lines changed: 2 additions & 2 deletions

@@ -176,7 +176,7 @@ def builder(name, **builder_init_kwargs):
 def load(name,
          split=None,
          data_dir=None,
-         batch_size=1,
+         batch_size=None,
          download=True,
          as_supervised=False,
          with_info=False,
@@ -228,7 +228,7 @@ def load(name,
       `tfds.Split.TEST`).
     data_dir: `str` (optional), directory to read/write data.
       Defaults to "~/tensorflow_datasets".
-    batch_size: `int`, set to > 1 to get batches of examples. Note that
+    batch_size: `int`, if set, add a batch dimension to examples. Note that
       variable length features will be 0-padded. If
       `batch_size=-1`, will return the full dataset as `tf.Tensor`s.
     download: `bool` (optional), whether to call
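
The same default flows through `tfds.load`. A short sketch of the call-site difference (again treating `mnist` as a stand-in dataset name):

import tensorflow_datasets as tfds

# After this change: unbatched examples by default.
ds = tfds.load("mnist", split=tfds.Split.TRAIN)

# Opt in to batching; variable-length features are 0-padded.
ds = tfds.load("mnist", split=tfds.Split.TRAIN, batch_size=32)

# batch_size=-1 still returns the full split as `tf.Tensor`s.
full = tfds.load("mnist", split=tfds.Split.TRAIN, batch_size=-1)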

tensorflow_datasets/core/registered_test.py

Lines changed: 11 additions & 1 deletion

@@ -120,7 +120,7 @@ def test_load(self):
     self.assertFalse(builder.download_called)
     self.assertEqual(splits.Split.TEST,
                      builder.as_dataset_kwargs.pop("split"))
-    self.assertEqual(1, builder.as_dataset_kwargs.pop("batch_size"))
+    self.assertEqual(None, builder.as_dataset_kwargs.pop("batch_size"))
     self.assertFalse(builder.as_dataset_kwargs.pop("as_supervised"))
     self.assertEqual(builder.as_dataset_kwargs, as_dataset_kwargs)
     self.assertEqual(dict(data_dir=data_dir, k1=1), builder.kwargs)
@@ -131,6 +131,16 @@ def test_load(self):
     self.assertTrue(builder.as_dataset_called)
     self.assertTrue(builder.download_called)

+    # Tests for different batch_size
+    # By default batch_size=None
+    builder = registered.load(
+        name=name, split=splits.Split.TEST, data_dir=data_dir)
+    self.assertEqual(None, builder.as_dataset_kwargs.pop("batch_size"))
+    # Setting batch_size=1
+    builder = registered.load(
+        name=name, split=splits.Split.TEST, data_dir=data_dir,
+        batch_size=1)
+
   def test_load_all_splits(self):
     name = "empty_dataset_builder"
     # EmptyDatasetBuilder returns self from as_dataset
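
One migration consequence worth noting: callers that relied on the old default of `batch_size=1` now receive unbatched examples and must pass the value explicitly to keep the old element shape. A sketch of hypothetical caller code, not part of the commit:

import tensorflow_datasets as tfds

# Before this commit, a bare load() added a leading size-1 batch dimension.
# To preserve that behavior under the new default, request it explicitly:
ds = tfds.load("mnist", split=tfds.Split.TRAIN, batch_size=1)
# Elements now look like {"image": (1, 28, 28, 1), "label": (1,)}, as before.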
