Add librispeech_lm text corpus, a companion to the speech+text librispeech corpus.

TensorFlow Datasets Team · copybara-github · commit 9851dff49b26 · 2020-02-21T09:51:28.000-08:00
PiperOrigin-RevId: 296452906
diff --git a/tensorflow_datasets/testing/test_data/fake_examples/librispeech_lm/librispeech-lm-norm.txt.gz b/tensorflow_datasets/testing/test_data/fake_examples/librispeech_lm/librispeech-lm-norm.txt.gz
diff --git a/tensorflow_datasets/text/__init__.py b/tensorflow_datasets/text/__init__.py
@@ -26,6 +26,7 @@
 from tensorflow_datasets.text.glue import Glue
 from tensorflow_datasets.text.imdb import IMDBReviews
 from tensorflow_datasets.text.imdb import IMDBReviewsConfig
+from tensorflow_datasets.text.librispeech_lm import LibrispeechLm
 from tensorflow_datasets.text.lm1b import Lm1b
 from tensorflow_datasets.text.lm1b import Lm1bConfig
 from tensorflow_datasets.text.math_dataset import MathDataset
@@ -44,3 +45,4 @@
 from tensorflow_datasets.text.wikipedia import Wikipedia
 from tensorflow_datasets.text.xnli import Xnli
 from tensorflow_datasets.text.yelp_polarity import YelpPolarityReviews
+
diff --git a/tensorflow_datasets/text/librispeech_lm.py b/tensorflow_datasets/text/librispeech_lm.py
@@ -0,0 +1,78 @@
+# coding=utf-8
+# Copyright 2020 The TensorFlow Datasets Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Librispeech language modeling dataset."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow_datasets.public_api as tfds
+
+_CITATION = """\
+@inproceedings{panayotov2015librispeech,
+  title={Librispeech: an ASR corpus based on public domain audio books},
+  author={Panayotov, Vassil and Chen, Guoguo and Povey, Daniel and Khudanpur, Sanjeev},
+  booktitle={Acoustics, Speech and Signal Processing (ICASSP), 2015 IEEE International Conference on},
+  pages={5206--5210},
+  year={2015},
+  organization={IEEE}
+}
+"""
+
+_DESCRIPTION = """\
+Language modeling resources to be used in conjunction with the LibriSpeech ASR corpus.
+"""
+
+_URL = 'http://www.openslr.org/11'  # Reported as the dataset homepage.
+# Location of the `.txt.gz` corpus file that `_split_generators` downloads.
+_DL_URL = 'http://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz'
+
+
+class LibrispeechLm(tfds.core.GeneratorBasedBuilder):
+  """Librispeech language modeling dataset: one example per text line."""
+
+  VERSION = tfds.core.Version('0.1.0')
+
+  def _info(self):
+    """Returns DatasetInfo; `text` is both the input and the target."""
+    return tfds.core.DatasetInfo(
+        builder=self,
+        description=_DESCRIPTION,
+        features=tfds.features.FeaturesDict({'text': tfds.features.Text()}),
+        supervised_keys=('text', 'text'),
+        homepage=_URL,
+        citation=_CITATION,
+    )
+
+  def _split_generators(self, dl_manager):
+    """Returns a single TRAIN split backed by the downloaded archive."""
+    archive_path = dl_manager.download(_DL_URL)
+    files_iter = dl_manager.iter_archive(archive_path)
+    return [
+        tfds.core.SplitGenerator(
+            name=tfds.Split.TRAIN, gen_kwargs={'files_iter': files_iter}),
+    ]
+
+  def _generate_examples(self, files_iter):
+    """Yields `(line_index, {'text': line})` for every non-empty line."""
+    # The archive contains a single file. The default keeps an empty archive
+    # from leaking StopIteration out of this generator, which PEP 479 would
+    # re-raise as an opaque RuntimeError; the split is simply left empty.
+    _, f = next(files_iter, (None, ()))
+    for key, line in enumerate(f):
+      text = line.strip()
+      if text:  # Skip empty lines.
+        yield key, {'text': text}
diff --git a/tensorflow_datasets/text/librispeech_lm_test.py b/tensorflow_datasets/text/librispeech_lm_test.py
@@ -0,0 +1,36 @@
+# coding=utf-8
+# Copyright 2020 The TensorFlow Datasets Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for LibrispeechLm dataset builder."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow_datasets import testing
+from tensorflow_datasets.text import librispeech_lm
+
+
+class LibrispeechLmTest(testing.DatasetBuilderTestCase):
+  """Exercises LibrispeechLm through the DatasetBuilderTestCase harness."""
+  DATASET_CLASS = librispeech_lm.LibrispeechLm
+  # File name of the fixture stored under fake_examples/librispeech_lm/.
+  DL_DOWNLOAD_RESULT = "librispeech-lm-norm.txt.gz"
+  SPLITS = {"train": 4}  # Number of fake train examples.
+
+
+if __name__ == "__main__":
+  testing.test_main()
+
diff --git a/tensorflow_datasets/url_checksums/librispeech_lm.txt b/tensorflow_datasets/url_checksums/librispeech_lm.txt
@@ -0,0 +1 @@
+http://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz 1507274412 927d47656638c2ca93566640e56bcbeb2aa4ca237b8490fe1eff7d6e610a53c7

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+http://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz 1507274412 927d47656638c2ca93566640e56bcbeb2aa4ca237b8490fe1eff7d6e610a53c7`