
Commit c79105b

TensorFlow Datasets Team authored and copybara-github committed
Create working librispeech tfds config, parallelizing example creation using beam.
Simplifies the splits by creating one for each of the 7 splits in the official release (dev-clean, test-clean, dev-other, test-other, train-clean-100, train-clean-360, train-other-500). The result is more flexible and consistent with the way the dataset is (AFAIK) used in the literature, where it is most common to train on the union of all 3 training sets (or occasionally only on train-clean-100) and evaluate separately on each of the clean/other dev/test sets.

This means that there are no single "train", "test", or "validation" splits, but it makes it easy for users to compose different subsets as desired using the tfds Split API.

PiperOrigin-RevId: 296426388
1 parent f679fc1 commit c79105b
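
Since the new splits are plain named splits, they compose with the tfds split arithmetic API. A minimal sketch (not part of this commit; the `tfds.load` calls are illustrative):

import tensorflow_datasets as tfds

# Train on the union of all 3 training sets, as is common in the literature.
train_ds = tfds.load(
    "librispeech",
    split="train_clean100+train_clean360+train_other500")

# Evaluate separately on each of the clean/other dev/test sets.
dev_clean_ds = tfds.load("librispeech", split="dev_clean")
test_other_ds = tfds.load("librispeech", split="test_other")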

File tree

3 files changed: +76 −168 lines

tensorflow_datasets/audio/librispeech.py

Lines changed: 53 additions & 134 deletions
@@ -19,7 +19,6 @@
 from __future__ import division
 from __future__ import print_function

-import collections
 import os

 import tensorflow.compat.v2 as tf
@@ -38,9 +37,9 @@
 """

 _DESCRIPTION = """\
-LibriSpeech is a corpus of approximately 1000 hours of read English speech of frequency 16 KHz,
+LibriSpeech is a corpus of approximately 1000 hours of read English speech with sampling rate of 16 kHz,
 prepared by Vassil Panayotov with the assistance of Daniel Povey. The data is derived from read
-audiobooks from the LibriVox project, and has been carefully segmented and aligned.
+audiobooks from the LibriVox project, and has been carefully segmented and aligned.87
 """

 _URL = "http://www.openslr.org/12"
@@ -54,91 +53,49 @@
     "train_clean360": _DL_URL + "train-clean-360.tar.gz",
     "train_other500": _DL_URL + "train-other-500.tar.gz",
 }
-_DATA_OPTIONS = ["clean100", "clean360", "all"]


-# TODO(tfds): Better support compositional configuration
 class LibrispeechConfig(tfds.core.BuilderConfig):
   """BuilderConfig for Librispeech."""

   @tfds.core.disallow_positional_args
-  def __init__(self, text_encoder_config=None, data="clean100", **kwargs):
+  def __init__(self, text_encoder_config=None, **kwargs):
     """Constructs a LibrispeechConfig.

     Args:
       text_encoder_config: `tfds.features.text.TextEncoderConfig`, configuration
         for the `tfds.features.text.TextEncoder` used for the text feature.
-      data: `str`, one of `(clean100, clean360, all)`. `clean100` uses only the
-        clean data without `train-clean-360`. `clean360` uses clean data with
-        `train-clean-360`. `all` uses all the data.
       **kwargs: keyword arguments forwarded to super.
     """
-    if data not in _DATA_OPTIONS:
-      raise ValueError("data must be one of %s" % _DATA_OPTIONS)
     name = kwargs.get("name")
     if name is None:
-      encoder_name = (
-          text_encoder_config.name if text_encoder_config else "plain_text")
-      data_name = data
-      name = "%s_%s" % (data_name, encoder_name)
+      name = (text_encoder_config.name if text_encoder_config else "plain_text")
     kwargs["name"] = name

     description = kwargs.get("description")
     if description is None:
       if text_encoder_config:
-        encoder_description = "Transcriptions use the %s" % (
+        description = "Transcriptions use the %s" % (
            text_encoder_config.encoder_cls.__name__)
       else:
-        encoder_description = "Transcriptions are in plain text."
-
-      if data == "all":
-        data_description = "Uses all data."
-      else:
-        data_description = ("Uses only clean data,%s including train-clean-360."
-                            % ("" if data == "clean360" else " not"))
-
-      description = "%s %s" % (data_description, encoder_description)
+        description = "Transcriptions are in plain text."
     kwargs["description"] = description

     super(LibrispeechConfig, self).__init__(**kwargs)
     self.text_encoder_config = text_encoder_config
-    self.data = data
-
-  @property
-  def download_urls(self):
-    """Returns download urls for this config."""
-    urls = {
-        tfds.Split.TRAIN: ["train_clean100"],
-        tfds.Split.VALIDATION: ["dev_clean"],
-        tfds.Split.TEST: ["test_clean"],
-    }
-    if self.data in ["all", "clean360"]:
-      urls[tfds.Split.TRAIN].append("train_clean360")
-    if self.data == "all":
-      urls[tfds.Split.TRAIN].extend(["train_clean360", "train_other500"])
-      urls[tfds.Split.VALIDATION].append("dev_other")
-      urls[tfds.Split.TEST].append("test_other")
-
-    urls = {
-        split: [_DL_URLS[name] for name in names
-               ] for split, names in urls.items()
-    }
-    return urls


 def _make_builder_configs():
   """Make built-in Librispeech BuilderConfigs.

-  Uses 4 text encodings (plain text, bytes, subwords with 8k vocab, subwords
-  with 32k vocab) crossed with the data subsets (clean100, clean360, all).
+  Uses 3 text encodings (plain_text, subwords with 8k vocab, subwords with 32k
+  vocab).

   Returns:
     `list<tfds.audio.LibrispeechConfig>`
   """
   text_encoder_configs = [
       None,
-      tfds.features.text.TextEncoderConfig(
-          name="bytes", encoder=tfds.features.text.ByteTextEncoder()),
       tfds.features.text.TextEncoderConfig(
           name="subwords8k",
           encoder_cls=tfds.features.text.SubwordTextEncoder,
@@ -149,27 +106,19 @@ def _make_builder_configs():
           vocab_size=2**15),
   ]
   configs = []
-  v001 = tfds.core.Version(
-      "0.0.1", experiments={tfds.core.Experiment.S3: False})
-  v1 = tfds.core.Version(
-      "1.0.0", "New split API (https://tensorflow.org/datasets/splits)")
   for text_encoder_config in text_encoder_configs:
-    for data in _DATA_OPTIONS:
-      config = LibrispeechConfig(
-          version=v1, supported_versions=[v001],
-          text_encoder_config=text_encoder_config,
-          data=data)
-      configs.append(config)
+    config = LibrispeechConfig(
+        version=tfds.core.Version("1.1.0"),
+        text_encoder_config=text_encoder_config)
+    configs.append(config)
   return configs


-class Librispeech(tfds.core.GeneratorBasedBuilder):
+class Librispeech(tfds.core.BeamBasedBuilder):
   """Librispeech dataset."""

   BUILDER_CONFIGS = _make_builder_configs()

-  IN_DEVELOPMENT = True
-
   def _info(self):
     return tfds.core.DatasetInfo(
         builder=self,
@@ -184,87 +133,57 @@ def _info(self):
                 tf.int64,
             "chapter_id":
                 tf.int64,
+            "id":
+                tf.string,
         }),
         supervised_keys=("speech", "text"),
         homepage=_URL,
         citation=_CITATION,
     )

   def _vocab_text_gen(self, dirs):
-    for unused_key, example in self._generate_examples(dirs):
-      yield example["text"]
+    for directory in dirs:
+      for _, example in _generate_librispeech_examples(directory):
+        yield example["text"]

   def _split_generators(self, dl_manager):
-    extracted_dirs = dl_manager.download_and_extract(
-        self.builder_config.download_urls)
-    # Generate vocabulary from training data if SubwordTextEncoder configured
-    self.info.features["text"].maybe_build_from_corpus(
-        self._vocab_text_gen(extracted_dirs[tfds.Split.TRAIN]))
-
-    return [
-        tfds.core.SplitGenerator(
-            name=tfds.Split.TRAIN,
-            num_shards=100,
-            gen_kwargs={
-                "dirs": extracted_dirs[tfds.Split.TRAIN],
-            }),
-        tfds.core.SplitGenerator(
-            name=tfds.Split.VALIDATION,
-            num_shards=10,
-            gen_kwargs={
-                "dirs": extracted_dirs[tfds.Split.VALIDATION],
-            }),
-        tfds.core.SplitGenerator(
-            name=tfds.Split.TEST,
-            num_shards=10,
-            gen_kwargs={
-                "dirs": extracted_dirs[tfds.Split.TEST],
-            }),
+    extracted_dirs = dl_manager.download_and_extract(_DL_URLS)
+    # Generate vocabulary from training data if SubwordTextEncoder configured.
+    all_train_dirs = [
+        v for k, v in extracted_dirs.items() if k.startswith("train")
     ]
-
-  def _generate_examples(self, dirs):
-    for directory in dirs:
-      for example in _walk_librispeech_dir(directory):
-        record = {
-            "speech": example.audio_file,
-            "text": example.transcript,
-            "speaker_id": example.speaker_id,
-            "chapter_id": example.chapter_id,
-        }
-        yield "%s/%s" % (example.speaker_id, example.chapter_id), record
-
-
-LibrispeechExample = collections.namedtuple(
-    "_LibrispeechExample",
-    ["speaker_id", "chapter_id", "audio_file", "transcript"])
-
-
-def _walk_librispeech_dir(directory):
-  """Walk a Librispeech directory and yield examples."""
-  directory = os.path.join(directory, "LibriSpeech")
-  for path, _, files in tf.io.gfile.walk(directory):
-    if not files:
-      continue
-
-    transcript_file = [f for f in files if f.endswith(".txt")]
-    if not transcript_file:
-      continue
-    assert len(transcript_file) == 1
-    transcript_file, = transcript_file
-    transcripts = {}
+    self.info.features["text"].maybe_build_from_corpus(
+        self._vocab_text_gen(all_train_dirs))
+
+    splits = [tfds.core.SplitGenerator(name=k, gen_kwargs={"directory": v})
+              for k, v in extracted_dirs.items()]
+    return splits
+
+  def _build_pcollection(self, pipeline, directory):
+    """Generates examples as dicts."""
+    beam = tfds.core.lazy_imports.apache_beam
+    return (pipeline
+            | beam.Create([directory])
+            | beam.FlatMap(_generate_librispeech_examples)
+            | beam.Reshuffle())
+
+
+def _generate_librispeech_examples(directory):
+  """Generate examples from a Librispeech directory."""
+  transcripts_glob = os.path.join(directory, "LibriSpeech", "*/*/*/*.txt")
+  for transcript_file in tf.io.gfile.glob(transcripts_glob):
+    path = os.path.dirname(transcript_file)
     with tf.io.gfile.GFile(os.path.join(path, transcript_file)) as f:
       for line in f:
         line = line.strip()
         key, transcript = line.split(" ", 1)
-        transcripts[key] = transcript
-    audio_files = [f for f in files if not f.endswith(".txt")]
-    for audio_file in audio_files:
-      assert audio_file.endswith(".flac")
-      key = audio_file[:-len(".flac")]
-      transcript = transcripts[key]
-      speaker_id, chapter_id = [int(el) for el in key.split("-")[:2]]
-      yield LibrispeechExample(
-          speaker_id=speaker_id,
-          chapter_id=chapter_id,
-          audio_file=os.path.join(path, audio_file),
-          transcript=transcript)
+        audio_file = "%s.flac" % key
+        speaker_id, chapter_id = [int(el) for el in key.split("-")[:2]]
+        example = {
+            "id": key,
+            "speaker_id": speaker_id,
+            "chapter_id": chapter_id,
+            "speech": os.path.join(path, audio_file),
+            "text": transcript
+        }
+        yield key, example
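
With Librispeech now a tfds.core.BeamBasedBuilder, generation runs through an Apache Beam pipeline. A minimal sketch of preparing the dataset locally (not part of this commit; assumes the `beam_options` field of `tfds.download.DownloadConfig` from the TFDS Beam docs, and illustrative DirectRunner flags):

import apache_beam as beam
import tensorflow_datasets as tfds

# Beam-based builders need pipeline options to generate examples; the
# DirectRunner suffices locally, though train-other-500 is ~30 GB.
dl_config = tfds.download.DownloadConfig(
    beam_options=beam.options.pipeline_options.PipelineOptions(
        flags=["--direct_num_workers=4"]))
builder = tfds.builder("librispeech/plain_text")
builder.download_and_prepare(download_config=dl_config)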

tensorflow_datasets/audio/librispeech_test.py

Lines changed: 16 additions & 34 deletions
@@ -21,48 +21,30 @@

 from tensorflow_datasets import testing
 from tensorflow_datasets.audio import librispeech
-import tensorflow_datasets.public_api as tfds


-class LibrispeechTest100(testing.DatasetBuilderTestCase):
+class LibrispeechTest(testing.DatasetBuilderTestCase):
   DATASET_CLASS = librispeech.Librispeech
-  BUILDER_CONFIG_NAMES_TO_TEST = ["clean100_plain_text", "clean100_bytes"]
+  BUILDER_CONFIG_NAMES_TO_TEST = ["plain_text", "subwords8k"]
   SPLITS = {
-      "train": 2,
-      "validation": 2,
-      "test": 2,
+      "train_clean100": 2,
+      "train_clean360": 2,
+      "train_other500": 2,
+      "test_clean": 2,
+      "test_other": 2,
+      "dev_clean": 2,
+      "dev_other": 2,
   }
-
   DL_EXTRACT_RESULT = {
-      tfds.Split.TRAIN: ["train-clean-100"],
-      tfds.Split.TEST: ["test-clean"],
-      tfds.Split.VALIDATION: ["dev-clean"],
+      "train_clean100": "train-clean-100",
+      "train_clean360": "train-clean-360",
+      "train_other500": "train-other-500",
+      "test_clean": "test-clean",
+      "test_other": "test-other",
+      "dev_clean": "dev-clean",
+      "dev_other": "dev-other",
   }


-class LibrispeechTest100S3(LibrispeechTest100):
-  VERSION = "experimental_latest"
-
-
-class LibrispeechTest360(testing.DatasetBuilderTestCase):
-  DATASET_CLASS = librispeech.Librispeech
-  BUILDER_CONFIG_NAMES_TO_TEST = ["clean360_plain_text"]
-  SPLITS = {
-      "train": 4,
-      "validation": 2,
-      "test": 2,
-  }
-
-  DL_EXTRACT_RESULT = {
-      tfds.Split.TRAIN: ["train-clean-100", "train-clean-360"],
-      tfds.Split.TEST: ["test-clean"],
-      tfds.Split.VALIDATION: ["dev-clean"],
-  }
-
-
-class LibrispeechTest360S3(LibrispeechTest360):
-  VERSION = "experimental_latest"
-
-
 if __name__ == "__main__":
   testing.test_main()
tensorflow_datasets/url_checksums/librispeech.txt

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
+http://www.openslr.org/resources/12/dev-clean.tar.gz 337926286 76f87d090650617fca0cac8f88b9416e0ebf80350acb97b343a85fa903728ab3
+http://www.openslr.org/resources/12/dev-other.tar.gz 314305928 12661c48e8c3fe1de2c1caa4c3e135193bfb1811584f11f569dd12645aa84365
+http://www.openslr.org/resources/12/test-clean.tar.gz 346663984 39fde525e59672dc6d1551919b1478f724438a95aa55f874b576be21967e6c23
+http://www.openslr.org/resources/12/test-other.tar.gz 328757843 d09c181bba5cf717b3dee7d4d592af11a3ee3a09e08ae025c5506f6ebe961c29
+http://www.openslr.org/resources/12/train-clean-100.tar.gz 6387309499 d4ddd1d5a6ab303066f14971d768ee43278a5f2a0aa43dc716b0e64ecbbbf6e2
+http://www.openslr.org/resources/12/train-clean-360.tar.gz 23049477885 146a56496217e96c14334a160df97fffedd6e0a04e66b9c5af0d40be3c792ecf
+http://www.openslr.org/resources/12/train-other-500.tar.gz 30593501606 ddb22f27f96ec163645d53215559df6aa36515f26e01dd70798188350adcb6d2
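
Each line of the new checksums file records a download URL, the archive size in bytes, and its SHA-256 hex digest, which the tfds download manager uses to validate downloads. A minimal sketch of that validation (hypothetical helper and local path; shown only to illustrate the file format):

import hashlib
import os

def verify_archive(path, expected_size, expected_sha256):
  """Checks a downloaded archive against a checksums entry."""
  if os.path.getsize(path) != expected_size:
    raise ValueError("Size mismatch for %s" % path)
  sha256 = hashlib.sha256()
  with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
      sha256.update(chunk)
  if sha256.hexdigest() != expected_sha256:
    raise ValueError("Checksum mismatch for %s" % path)

verify_archive(
    "dev-clean.tar.gz", 337926286,
    "76f87d090650617fca0cac8f88b9416e0ebf80350acb97b343a85fa903728ab3")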
