Skip to content

Commit 9851dff

Browse files
TensorFlow Datasets Team, copybara-github
TensorFlow Datasets Team
authored and committed
Add librispeech_lm text corpus, a companion to the speech+text librispeech corpus.
PiperOrigin-RevId: 296452906
1 parent c79105b commit 9851dff

File tree

5 files changed

+117
-0
lines changed

5 files changed

+117
-0
lines changed

tensorflow_datasets/text/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
from tensorflow_datasets.text.glue import Glue
2727
from tensorflow_datasets.text.imdb import IMDBReviews
2828
from tensorflow_datasets.text.imdb import IMDBReviewsConfig
29+
from tensorflow_datasets.text.librispeech_lm import LibrispeechLm
2930
from tensorflow_datasets.text.lm1b import Lm1b
3031
from tensorflow_datasets.text.lm1b import Lm1bConfig
3132
from tensorflow_datasets.text.math_dataset import MathDataset
@@ -44,3 +45,4 @@
4445
from tensorflow_datasets.text.wikipedia import Wikipedia
4546
from tensorflow_datasets.text.xnli import Xnli
4647
from tensorflow_datasets.text.yelp_polarity import YelpPolarityReviews
48+
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
# coding=utf-8
2+
# Copyright 2020 The TensorFlow Datasets Authors.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
"""Librispeech language modeling dataset."""
17+
18+
from __future__ import absolute_import
19+
from __future__ import division
20+
from __future__ import print_function
21+
22+
import tensorflow_datasets.public_api as tfds
23+
24+
# BibTeX entry for the LibriSpeech paper; passed to DatasetInfo as `citation`.
_CITATION = """\
@inproceedings{panayotov2015librispeech,
title={Librispeech: an ASR corpus based on public domain audio books},
author={Panayotov, Vassil and Chen, Guoguo and Povey, Daniel and Khudanpur, Sanjeev},
booktitle={Acoustics, Speech and Signal Processing (ICASSP), 2015 IEEE International Conference on},
pages={5206--5210},
year={2015},
organization={IEEE}
}
"""

# One-line summary of the dataset; passed to DatasetInfo as `description`.
_DESCRIPTION = """\
Language modeling resources to be used in conjunction with the LibriSpeech ASR corpus.
"""

# Dataset landing page; passed to DatasetInfo as `homepage`.
_URL = 'http://www.openslr.org/11'

# Location of the `.txt.gz` corpus file fetched by `_split_generators`.
_DL_URL = 'http://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz'
42+
43+
44+
class LibrispeechLm(tfds.core.GeneratorBasedBuilder):
  """Librispeech language modeling dataset.

  Text-only builder: each example carries one non-empty line of the corpus
  file located at `_DL_URL`, under the single feature `text`. Only a `train`
  split is produced.
  """

  VERSION = tfds.core.Version('0.1.0')

  def _info(self):
    """Returns the `tfds.core.DatasetInfo` describing this dataset."""
    return tfds.core.DatasetInfo(
        builder=self,
        description=_DESCRIPTION,
        features=tfds.features.FeaturesDict({
            'text': tfds.features.Text(),
        }),
        # Input and target are the same feature.
        supervised_keys=('text', 'text'),
        homepage=_URL,
        citation=_CITATION,
    )

  def _split_generators(self, dl_manager):
    """Returns SplitGenerators.

    Args:
      dl_manager: download manager used to fetch `_DL_URL` and to open the
        downloaded file through `iter_archive`.

    Returns:
      A one-element list holding the `train` split generator, whose
      `files_iter` kwarg is forwarded to `_generate_examples`.
    """
    archive_path = dl_manager.download(_DL_URL)
    return [
        tfds.core.SplitGenerator(
            name=tfds.Split.TRAIN,
            gen_kwargs={'files_iter': dl_manager.iter_archive(archive_path)},
        ),
    ]

  def _generate_examples(self, files_iter):
    """Yields examples.

    Args:
      files_iter: iterator over the entries of the downloaded archive. Each
        entry is unpacked as a pair; the second element is iterated line by
        line (presumably a `(path, file object)` pair - confirm against
        `dl_manager.iter_archive`).

    Yields:
      `(key, {'text': line})` pairs, where `key` is the zero-based index of
      the line in the file and `line` is the stripped line. Empty lines are
      skipped, so keys may have gaps.

    Raises:
      RuntimeError: if `files_iter` yields no entry at all.
    """
    # The archive contains a single file.
    # Use a sentinel default instead of a bare `next()`: a StopIteration that
    # escapes inside a generator is turned into an opaque
    # "generator raised StopIteration" RuntimeError (PEP 479) or, on older
    # interpreters, silently ends the generator with zero examples.
    first_entry = next(files_iter, None)
    if first_entry is None:
      raise RuntimeError(
          'No file found in the archive downloaded from %s' % _DL_URL)
    _, f = first_entry
    for key, line in enumerate(f):
      text = line.strip()
      if text:  # Skip empty lines.
        yield key, {'text': text}
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
# coding=utf-8
2+
# Copyright 2020 The TensorFlow Datasets Authors.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
"""Tests for LibrispeechLm dataset builder."""
17+
18+
from __future__ import absolute_import
19+
from __future__ import division
20+
from __future__ import print_function
21+
22+
from tensorflow_datasets import testing
23+
from tensorflow_datasets.text import librispeech_lm
24+
25+
26+
class LibrispeechLmTest(testing.DatasetBuilderTestCase):
  """Runs the shared dataset-builder test case against `LibrispeechLm`."""

  DATASET_CLASS = librispeech_lm.LibrispeechLm
  # Number of fake examples expected in each split.
  SPLITS = {"train": 4}
  # Presumably the fake-data file substituted for the real download - confirm
  # against `testing.DatasetBuilderTestCase`.
  DL_DOWNLOAD_RESULT = "librispeech-lm-norm.txt.gz"


if __name__ == "__main__":
  testing.test_main()
36+
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
http://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz 1507274412 927d47656638c2ca93566640e56bcbeb2aa4ca237b8490fe1eff7d6e610a53c7

0 commit comments

Comments
 (0)