
Commit 41fbb33

pierrot0 authored and copybara-github committed
make S3 experiment default to True (Issue #737).
PiperOrigin-RevId: 257561767
1 parent 9986589 commit 41fbb33

92 files changed: +460 −410 lines (large commits hide some content by default; only a subset of the 92 diffs is shown below).
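The change itself is mechanical: `tfds.core.Version` now enables the S3 experiment (the new shuffling, sharding and slicing mechanism) unless a builder explicitly opts out. As a rough sketch of what the new default means (the version strings here are illustrative, not from this commit):

```python
import tensorflow_datasets as tfds

# After this commit, these two declarations are equivalent:
v_default = tfds.core.Version("1.0.0")
v_s3_on = tfds.core.Version("1.0.0",
                            experiments={tfds.core.Experiment.S3: True})

# Builders that cannot produce the new format yet (e.g. Beam-based ones)
# must now opt out explicitly:
v_s3_off = tfds.core.Version("1.0.0",
                             experiments={tfds.core.Experiment.S3: False})
```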

docs/add_dataset.md

Lines changed: 10 additions & 11 deletions
@@ -103,7 +103,8 @@ Its subclasses implement:
     [`DatasetInfo`](api_docs/python/tfds/core/DatasetInfo.md) object
     describing the dataset
 * `_split_generators`: downloads the source data and defines the dataset splits
-* `_generate_examples`: yields examples in the dataset from the source data
+* `_generate_examples`: yields `(key, example)` tuples in the dataset from the
+  source data
 
 This guide will use `GeneratorBasedBuilder`.
 
@@ -131,7 +132,7 @@ class MyDataset(tfds.core.GeneratorBasedBuilder):
 
   def _generate_examples(self):
     # Yields examples from the dataset
-    pass  # TODO
+    yield 'key', {}
 ```
 
 If you'd like to follow a test-driven development workflow, which can help you
@@ -229,15 +230,13 @@ through [`tfds.Split.subsplit`](splits.md#subsplit).
   return [
       tfds.core.SplitGenerator(
           name=tfds.Split.TRAIN,
-          num_shards=10,
          gen_kwargs={
              "images_dir_path": os.path.join(extracted_path, "train"),
              "labels": os.path.join(extracted_path, "train_labels.csv"),
          },
      ),
      tfds.core.SplitGenerator(
          name=tfds.Split.TEST,
-          num_shards=1,
          gen_kwargs={
              "images_dir_path": os.path.join(extracted_path, "test"),
              "labels": os.path.join(extracted_path, "test_labels.csv"),
@@ -250,10 +249,6 @@ through [`tfds.Split.subsplit`](splits.md#subsplit).
 will be passed as keyword arguments to `_generate_examples`, which we'll define
 next.
 
-When specifying `num_shards`, which determines how many files the split will
-use, pick a number such that a single shard is less that 4 GiB as
-as each shard will be loaded in memory for shuffling.
-
 ## Writing an example generator
 
 `_generate_examples` generates the examples for each split from the
@@ -268,8 +263,8 @@ builder._generate_examples(
 ```
 
 This method will typically read source dataset artifacts (e.g. a CSV file) and
-yield feature dictionaries that correspond to the features specified in
-`DatasetInfo`.
+yield (key, feature dictionary) tuples that correspond to the features specified
+in `DatasetInfo`.
 
 ```python
 def _generate_examples(self, images_dir_path, labels):
@@ -281,7 +276,7 @@ def _generate_examples(self, images_dir_path, labels):
 
   # And yield examples as feature dictionaries
   for image_id, description, label in data:
-    yield {
+    yield image_id, {
        "image_description": description,
        "image": "%s/%s.jpeg" % (images_dir_path, image_id),
        "label": label,
@@ -293,6 +288,10 @@ format suitable for writing to disk (currently we use `tf.train.Example`
 protocol buffers). For example, `tfds.features.Image` will copy out the
 JPEG content of the passed image files automatically.
 
+The key (here: `image_id`) should uniquely identify the record. It is used to
+shuffle the dataset globally. If two records are yielded using the same key,
+an exception will be raised during preparation of the dataset.
+
 If you've implemented the test harness, your builder test should now pass.
 
 ### File access and `tf.io.gfile`
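Putting the updated contract together: under S3, `_generate_examples` yields `(key, example)` tuples, and the key drives global shuffling. A minimal sketch of such a generator follows; the CSV layout and field names are assumed for illustration, not taken from this commit:

```python
import csv
import os

import tensorflow as tf


def _generate_examples(self, images_dir_path, labels):
  """Yields (key, example) tuples; each key must be unique in the split."""
  with tf.io.gfile.GFile(labels) as f:
    for row in csv.DictReader(f):
      image_id = row["image_id"]
      # The key is hashed to shuffle the dataset globally; yielding the
      # same key twice raises an error while preparing the dataset.
      yield image_id, {
          "image": os.path.join(images_dir_path, "%s.jpeg" % image_id),
          "label": row["label"],
      }
```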

docs/beam_datasets.md

Lines changed: 3 additions & 1 deletion
@@ -67,7 +67,9 @@ a look at the
 ```python
 class DummyBeamDataset(tfds.core.BeamBasedBuilder):
 
-  VERSION = tfds.core.Version('1.0.0')
+  # BeamBasedBuilder does not support S3 yet.
+  VERSION = tfds.core.Version(
+      '1.0.0', experiments={tfds.core.Experiment.S3: False})
 
   def _info(self):
     return tfds.core.DatasetInfo(
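The new `test_s3_raise` in `dataset_builder_beam_test.py` (further down) pins down why this opt-out is needed: a Beam builder prepared with S3 enabled never fills in `DatasetInfo.SplitInfo.num_shards`, so `as_dataset()` fails.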

docs/datasets_versioning.md

Lines changed: 2 additions & 3 deletions
@@ -111,7 +111,6 @@ class MNIST(tfds.core.GeneratorBasedBuilder):
   VERSION = tfds.core.Version("1.0.0")
   SUPPORTED_VERSIONS = [
       tfds.core.Version("2.0.0", experiments={tfds.core.Experiment.S3: True}),
-      tfds.core.Version("1.0.0"),
   ]
   # Version history:
   # 2.0.0: S3 (new shuffling, sharding and slicing mechanism).
@@ -123,10 +122,10 @@ definition would then look like:
 
 ```py
 class MNIST(tfds.core.GeneratorBasedBuilder):
-  VERSION = tfds.core.Version("2.0.0")
+  VERSION = tfds.core.Version("1.0.0",
+                              experiments={tfds.core.Experiment.S3: False})
   SUPPORTED_VERSIONS = [
       tfds.core.Version("2.0.0"),
-      tfds.core.Version("1.0.0", experiments={tfds.core.Experiment.S3: False}),
   ]
   # Version history:
   # 2.0.0: S3 (new shuffling, sharding and slicing mechanism), order of records
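For users, swapping which version sits in `VERSION` versus `SUPPORTED_VERSIONS` keeps the default on the pre-S3 format while leaving the S3 variant reachable. Assuming the `name:version` selection syntax this doc describes, choosing between them would look like:

```python
import tensorflow_datasets as tfds

# Default: the canonical version declared in VERSION (here 1.0.0, S3 off).
ds = tfds.load("mnist")

# Explicitly request the S3 variant listed in SUPPORTED_VERSIONS.
ds_s3 = tfds.load("mnist:2.0.0")
```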

tensorflow_datasets/audio/groove.py

Lines changed: 5 additions & 6 deletions
@@ -77,7 +77,11 @@ def __init__(self, split_bars=None, include_audio=True, audio_rate=16000,
     else:
       name_parts.append("midionly")
 
-    super(GrooveConfig, self).__init__(name="-".join(name_parts), **kwargs)
+    super(GrooveConfig, self).__init__(
+        name="-".join(name_parts),
+        version=tfds.core.Version(
+            "1.0.0", experiments={tfds.core.Experiment.S3: False}),
+        **kwargs)
     self.split_bars = split_bars
     self.include_audio = include_audio
     self.audio_rate = audio_rate
@@ -89,30 +93,25 @@ class Groove(tfds.core.GeneratorBasedBuilder):
   BUILDER_CONFIGS = [
       GrooveConfig(
           include_audio=False,
-          version="1.0.0",
           description="Groove dataset without audio, unsplit."
       ),
       GrooveConfig(
           include_audio=True,
-          version="1.0.0",
           description="Groove dataset with audio, unsplit."
       ),
       GrooveConfig(
           include_audio=False,
           split_bars=2,
-          version="1.0.0",
           description="Groove dataset without audio, split into 2-bar chunks."
       ),
       GrooveConfig(
           include_audio=True,
           split_bars=2,
-          version="1.0.0",
           description="Groove dataset with audio, split into 2-bar chunks."
       ),
       GrooveConfig(
           include_audio=False,
           split_bars=4,
-          version="1.0.0",
           description="Groove dataset without audio, split into 4-bar chunks."
       ),
   ]
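The Groove change also illustrates the pattern this commit applies across builders: rather than repeating `version="1.0.0"` on each of the five `GrooveConfig` instances, the shared `Version` object (carrying the S3 opt-out) is set once in `GrooveConfig.__init__` and passed to the base `BuilderConfig`, so the opt-out lives in a single place.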

tensorflow_datasets/audio/librispeech.py

Lines changed: 4 additions & 2 deletions
@@ -149,12 +149,14 @@ def _make_builder_configs():
           encoder_cls=tfds.features.text.SubwordTextEncoder,
           vocab_size=2**15),
   ]
-  version = "0.1.0"
   configs = []
   for text_encoder_config in text_encoder_configs:
     for data in _DATA_OPTIONS:
       config = LibrispeechConfig(
-          version=version, text_encoder_config=text_encoder_config, data=data)
+          version=tfds.core.Version(
+              "0.0.1", experiments={tfds.core.Experiment.S3: False}),
+          text_encoder_config=text_encoder_config,
+          data=data)
       configs.append(config)
   return configs
 

tensorflow_datasets/audio/nsynth.py

Lines changed: 2 additions & 1 deletion
@@ -87,7 +87,8 @@
 class Nsynth(tfds.core.GeneratorBasedBuilder):
   """A large-scale and high-quality dataset of annotated musical notes."""
 
-  VERSION = tfds.core.Version("1.0.0")
+  VERSION = tfds.core.Version("1.0.0",
+                              experiments={tfds.core.Experiment.S3: False})
 
   def _info(self):
     return tfds.core.DatasetInfo(

tensorflow_datasets/core/dataset_builder_beam_test.py

Lines changed: 26 additions & 6 deletions
@@ -39,7 +39,7 @@
 
 class DummyBeamDataset(dataset_builder.BeamBasedBuilder):
 
-  VERSION = utils.Version("1.0.0")
+  VERSION = utils.Version("1.0.0", experiments={utils.Experiment.S3: False})
 
   def _info(self):
 
@@ -86,6 +86,11 @@ def _gen_example(x):
   }
 
 
+class FaultyS3DummyBeamDataset(DummyBeamDataset):
+
+  VERSION = utils.Version("1.0.0")
+
+
 class BeamBasedBuilderTest(testing.TestCase):
 
   def test_download_prepare_raise(self):
@@ -147,20 +152,35 @@ def _assertElemsAllEqual(self, nested_lhs, nested_rhs):
         self.assertAllEqual(lhs, rhs)
 
 
-  # The default beam pipeline do not works with Python2
-  def test_download_prepare(self):
-
+  def _get_dl_config_if_need_to_run(self):
+    # The default beam pipeline do not works with Python2
     # TODO(b/129148632): The current apache-beam 2.11.0 do not work with Py3
     # Update once the new version is out (around April)
     skip_beam_test = bool(six.PY3)
    if skip_beam_test:
      return
-
-    dl_config = download.DownloadConfig(
+    return download.DownloadConfig(
        beam_options=beam.options.pipeline_options.PipelineOptions(),
    )
+
+  def test_download_prepare(self):
+    dl_config = self._get_dl_config_if_need_to_run()
+    if not dl_config:
+      return
     self._assertBeamGeneration(dl_config)
 
+  def test_s3_raise(self):
+    dl_config = self._get_dl_config_if_need_to_run()
+    if not dl_config:
+      return
+    dl_config.compute_stats = download.ComputeStatsMode.SKIP
+    with testing.tmp_dir(self.get_temp_dir()) as tmp_dir:
+      builder = FaultyS3DummyBeamDataset(data_dir=tmp_dir)
+      builder.download_and_prepare(download_config=dl_config)
+      with self.assertRaisesWithPredicateMatch(
+          AssertionError, "`DatasetInfo.SplitInfo.num_shards` is empty"):
+        builder.as_dataset()
+
 
 if __name__ == "__main__":
   testing.test_main()

tensorflow_datasets/core/dataset_builder_test.py

Lines changed: 23 additions & 23 deletions
@@ -52,13 +52,13 @@ class DummyDatasetWithConfigs(dataset_builder.GeneratorBasedBuilder):
   BUILDER_CONFIGS = [
       DummyBuilderConfig(
           name="plus1",
-          version="0.0.1",
+          version=utils.Version("0.0.1"),
           description="Add 1 to the records",
           increment=1),
       DummyBuilderConfig(
           name="plus2",
-          version="0.0.2",
-          supported_versions=["0.0.1"],
+          version=utils.Version("0.0.2"),
+          supported_versions=[utils.Version("0.0.1")],
           description="Add 2 to the records",
           increment=2),
   ]
@@ -70,12 +70,10 @@ def _split_generators(self, dl_manager):
     return [
         splits_lib.SplitGenerator(
             name=splits_lib.Split.TRAIN,
-            num_shards=2,
             gen_kwargs={"range_": range(20)},
         ),
         splits_lib.SplitGenerator(
             name=splits_lib.Split.TEST,
-            num_shards=1,
             gen_kwargs={"range_": range(20, 30)},
         ),
     ]
@@ -90,9 +88,10 @@ def _info(self):
 
   def _generate_examples(self, range_):
     for i in range_:
+      x = i
       if self.builder_config:
-        i += self.builder_config.increment
-      yield {"x": i}
+        x += self.builder_config.increment
+      yield i, {"x": x}
 
 
 class InvalidSplitDataset(DummyDatasetWithConfigs):
@@ -143,8 +142,8 @@ def test_determinism(self):
     # deterministically generated.
     self.assertEqual(
         [e["x"] for e in ds_values],
-        [16, 1, 2, 3, 10, 17, 0, 11, 14, 7, 4, 9, 18, 15, 8, 19, 6, 13, 12,
-         5],
+        [6, 16, 19, 12, 14, 18, 5, 13, 15, 4, 10, 17, 0, 8, 3, 1, 9, 7, 11,
+         2],
     )
 
   @testing.run_in_graph_and_eager_modes()
@@ -153,7 +152,7 @@ def test_multi_split(self):
       ds_train, ds_test = registered.load(
           name="dummy_dataset_shared_generator",
          data_dir=tmp_dir,
-          split=[splits_lib.Split.TRAIN, splits_lib.Split.TEST],
+          split=["train", "test"],
          as_dataset_kwargs=dict(shuffle_files=False))
 
      data = list(dataset_utils.as_numpy(ds_train))
@@ -220,12 +219,12 @@ def test_with_configs(self):
       # Test that subdirectories were created per config
       self.assertTrue(tf.io.gfile.exists(data_dir1))
       self.assertTrue(tf.io.gfile.exists(data_dir2))
-      # 2 train shards, 1 test shard, plus metadata files
-      self.assertGreater(len(tf.io.gfile.listdir(data_dir1)), 3)
-      self.assertGreater(len(tf.io.gfile.listdir(data_dir2)), 3)
+      # 1 train shard, 1 test shard, plus metadata files
+      self.assertGreater(len(tf.io.gfile.listdir(data_dir1)), 2)
+      self.assertGreater(len(tf.io.gfile.listdir(data_dir2)), 2)
 
       # Test that the config was used and they didn't collide.
-      splits_list = [splits_lib.Split.TRAIN, splits_lib.Split.TEST]
+      splits_list = ["train", "test"]
       for builder, incr in [(builder1, 1), (builder2, 2)]:
         train_data, test_data = [  # pylint: disable=g-complex-comprehension
             [el["x"] for el in  # pylint: disable=g-complex-comprehension
@@ -301,23 +300,24 @@ def load_mnist_dataset_info(self):
   def test_stats_restored_from_gcs(self):
     with testing.tmp_dir(self.get_temp_dir()) as tmp_dir:
       builder = testing.DummyMnist(data_dir=tmp_dir)
-      self.assertEqual(builder.info.splits.total_num_examples, 70000)
+      self.assertEqual(builder.info.splits.total_num_examples, 40)
       self.assertFalse(self.compute_dynamic_property.called)
 
       builder.download_and_prepare()
 
       # Statistics shouldn't have been recomputed
-      self.assertEqual(builder.info.splits.total_num_examples, 70000)
+      self.assertEqual(builder.info.splits.total_num_examples, 40)
       self.assertFalse(self.compute_dynamic_property.called)
 
   def test_stats_not_restored_gcs_overwritten(self):
     with testing.tmp_dir(self.get_temp_dir()) as tmp_dir:
       # If split are different that the one restored, stats should be recomputed
-      builder = testing.DummyMnist(data_dir=tmp_dir, num_shards=5)
-      self.assertEqual(builder.info.splits.total_num_examples, 70000)
+      builder = testing.DummyMnist(data_dir=tmp_dir)
+      self.assertEqual(builder.info.splits.total_num_examples, 40)
       self.assertFalse(self.compute_dynamic_property.called)
 
-      builder.download_and_prepare()
+      dl_config = download.DownloadConfig(max_examples_per_split=5)
+      builder.download_and_prepare(download_config=dl_config)
 
       # Statistics should have been recomputed (split different from the
       # restored ones)
@@ -347,7 +347,7 @@ def test_skip_stats(self):
     self.patch_gcs.stop()
     with testing.tmp_dir(self.get_temp_dir()) as tmp_dir:
       # No dataset_info restored, so stats are empty
-      builder = testing.DummyMnist(data_dir=tmp_dir, num_shards=5)
+      builder = testing.DummyMnist(data_dir=tmp_dir)
       self.assertEqual(builder.info.splits.total_num_examples, 0)
       self.assertFalse(self.compute_dynamic_property.called)
 
@@ -366,8 +366,8 @@ def test_force_stats(self):
 
     with testing.tmp_dir(self.get_temp_dir()) as tmp_dir:
       # No dataset_info restored, so stats are empty
-      builder = testing.DummyMnist(data_dir=tmp_dir, num_shards=5)
-      self.assertEqual(builder.info.splits.total_num_examples, 70000)
+      builder = testing.DummyMnist(data_dir=tmp_dir)
+      self.assertEqual(builder.info.splits.total_num_examples, 40)
       self.assertFalse(self.compute_dynamic_property.called)
 
       download_config = download.DownloadConfig(
@@ -433,7 +433,7 @@ def test_all_splits(self):
   @testing.run_in_graph_and_eager_modes()
   def test_with_batch_size(self):
     items = list(dataset_utils.as_numpy(self.builder.as_dataset(
-        split=splits_lib.Split.TRAIN + splits_lib.Split.TEST, batch_size=10)))
+        split="train+test", batch_size=10)))
     # 3 batches of 10
     self.assertEqual(3, len(items))
     x1, x2, x3 = items[0]["x"], items[1]["x"], items[2]["x"]
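These test updates also show the user-facing side of S3: splits are now addressed by plain strings, including the `+` concatenation syntax exercised in `test_with_batch_size`. A quick sketch of that API (the dataset name is illustrative):

```python
import tensorflow_datasets as tfds

# String split names replace the splits_lib.Split constants in user code...
ds_train = tfds.load("mnist", split="train")

# ...and "train+test" concatenates two splits into one dataset.
ds_all = tfds.load("mnist", split="train+test")
```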
