
Commit c79105b

TensorFlow Datasets Team authored and copybara-github committed
Create working librispeech tfds config, parallelizing example creation using beam.
Simplifies the splits by creating one for each of the 7 splits in the official release (dev-clean, test-clean, dev-other, test-other, train-clean-100, train-clean-360, train-other-500). The result is more flexible and consistent with the way the dataset is (AFAIK) used in the literature, where it is most common to train on the union of all 3 training sets (or occasionally only on train-clean-100) and evaluate separately on each of the clean/other dev/test sets.

This means that there are no single "train", "test", or "validation" splits, but it makes it easy for users to compose different subsets as desired using the tfds Split API.

PiperOrigin-RevId: 296426388
1 parent f679fc1 commit c79105b
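
Since the new splits are plain named splits, they compose with the tfds split arithmetic API. A minimal sketch (not part of this commit; the `tfds.load` calls are illustrative):

import tensorflow_datasets as tfds

# Train on the union of all 3 training sets, as is common in the literature.
train_ds = tfds.load(
    "librispeech",
    split="train_clean100+train_clean360+train_other500")

# Evaluate separately on each of the clean/other dev/test sets.
dev_clean_ds = tfds.load("librispeech", split="dev_clean")
test_other_ds = tfds.load("librispeech", split="test_other")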

File tree

3 files changed: +76 −168 lines

tensorflow_datasets/audio/librispeech.py

Lines changed: 53 additions & 134 deletions
@@ -19,7 +19,6 @@
 from __future__ import division
 from __future__ import print_function

-import collections
 import os

 import tensorflow.compat.v2 as tf
@@ -38,9 +37,9 @@
 """

 _DESCRIPTION = """\
-LibriSpeech is a corpus of approximately 1000 hours of read English speech of frequency 16 KHz,
+LibriSpeech is a corpus of approximately 1000 hours of read English speech with sampling rate of 16 kHz,
 prepared by Vassil Panayotov with the assistance of Daniel Povey. The data is derived from read
-audiobooks from the LibriVox project, and has been carefully segmented and aligned.
+audiobooks from the LibriVox project, and has been carefully segmented and aligned.87
 """

 _URL = "http://www.openslr.org/12"
@@ -54,91 +53,49 @@
     "train_clean360": _DL_URL + "train-clean-360.tar.gz",
     "train_other500": _DL_URL + "train-other-500.tar.gz",
 }
-_DATA_OPTIONS = ["clean100", "clean360", "all"]


-# TODO(tfds): Better support compositional configuration
 class LibrispeechConfig(tfds.core.BuilderConfig):
   """BuilderConfig for Librispeech."""

   @tfds.core.disallow_positional_args
-  def __init__(self, text_encoder_config=None, data="clean100", **kwargs):
+  def __init__(self, text_encoder_config=None, **kwargs):
     """Constructs a LibrispeechConfig.

     Args:
       text_encoder_config: `tfds.features.text.TextEncoderConfig`, configuration
         for the `tfds.features.text.TextEncoder` used for the text feature.
-      data: `str`, one of `(clean100, clean360, all)`. `clean100` uses only the
-        clean data without `train-clean-360`. `clean360` uses clean data with
-        `train-clean-360`. `all` uses all the data.
       **kwargs: keyword arguments forwarded to super.
     """
-    if data not in _DATA_OPTIONS:
-      raise ValueError("data must be one of %s" % _DATA_OPTIONS)
     name = kwargs.get("name")
     if name is None:
-      encoder_name = (
-          text_encoder_config.name if text_encoder_config else "plain_text")
-      data_name = data
-      name = "%s_%s" % (data_name, encoder_name)
+      name = (text_encoder_config.name if text_encoder_config else "plain_text")
     kwargs["name"] = name

     description = kwargs.get("description")
     if description is None:
       if text_encoder_config:
-        encoder_description = "Transcriptions use the %s" % (
+        description = "Transcriptions use the %s" % (
            text_encoder_config.encoder_cls.__name__)
       else:
-        encoder_description = "Transcriptions are in plain text."
-
-      if data == "all":
-        data_description = "Uses all data."
-      else:
-        data_description = ("Uses only clean data,%s including train-clean-360."
-                            % ("" if data == "clean360" else " not"))
-
-      description = "%s %s" % (data_description, encoder_description)
+        description = "Transcriptions are in plain text."
     kwargs["description"] = description

     super(LibrispeechConfig, self).__init__(**kwargs)
     self.text_encoder_config = text_encoder_config
-    self.data = data
-
-  @property
-  def download_urls(self):
-    """Returns download urls for this config."""
-    urls = {
-        tfds.Split.TRAIN: ["train_clean100"],
-        tfds.Split.VALIDATION: ["dev_clean"],
-        tfds.Split.TEST: ["test_clean"],
-    }
-    if self.data in ["all", "clean360"]:
-      urls[tfds.Split.TRAIN].append("train_clean360")
-    if self.data == "all":
-      urls[tfds.Split.TRAIN].extend(["train_clean360", "train_other500"])
-      urls[tfds.Split.VALIDATION].append("dev_other")
-      urls[tfds.Split.TEST].append("test_other")
-
-    urls = {
-        split: [_DL_URLS[name] for name in names
-               ] for split, names in urls.items()
-    }
-    return urls


 def _make_builder_configs():
   """Make built-in Librispeech BuilderConfigs.

-  Uses 4 text encodings (plain text, bytes, subwords with 8k vocab, subwords
-  with 32k vocab) crossed with the data subsets (clean100, clean360, all).
+  Uses 3 text encodings (plain_text, subwords with 8k vocab, subwords with 32k
+  vocab).

   Returns:
     `list<tfds.audio.LibrispeechConfig>`
   """
   text_encoder_configs = [
       None,
-      tfds.features.text.TextEncoderConfig(
-          name="bytes", encoder=tfds.features.text.ByteTextEncoder()),
       tfds.features.text.TextEncoderConfig(
           name="subwords8k",
           encoder_cls=tfds.features.text.SubwordTextEncoder,
@@ -149,27 +106,19 @@ def _make_builder_configs():
           vocab_size=2**15),
   ]
   configs = []
-  v001 = tfds.core.Version(
-      "0.0.1", experiments={tfds.core.Experiment.S3: False})
-  v1 = tfds.core.Version(
-      "1.0.0", "New split API (https://tensorflow.org/datasets/splits)")
   for text_encoder_config in text_encoder_configs:
-    for data in _DATA_OPTIONS:
-      config = LibrispeechConfig(
-          version=v1, supported_versions=[v001],
-          text_encoder_config=text_encoder_config,
-          data=data)
-      configs.append(config)
+    config = LibrispeechConfig(
+        version=tfds.core.Version("1.1.0"),
+        text_encoder_config=text_encoder_config)
+    configs.append(config)
   return configs


-class Librispeech(tfds.core.GeneratorBasedBuilder):
+class Librispeech(tfds.core.BeamBasedBuilder):
   """Librispeech dataset."""

   BUILDER_CONFIGS = _make_builder_configs()

-  IN_DEVELOPMENT = True
-
   def _info(self):
     return tfds.core.DatasetInfo(
         builder=self,
@@ -184,87 +133,57 @@ def _info(self):
                 tf.int64,
             "chapter_id":
                 tf.int64,
+            "id":
+                tf.string,
         }),
         supervised_keys=("speech", "text"),
         homepage=_URL,
         citation=_CITATION,
     )

   def _vocab_text_gen(self, dirs):
-    for unused_key, example in self._generate_examples(dirs):
-      yield example["text"]
+    for directory in dirs:
+      for _, example in _generate_librispeech_examples(directory):
+        yield example["text"]

   def _split_generators(self, dl_manager):
-    extracted_dirs = dl_manager.download_and_extract(
-        self.builder_config.download_urls)
-    # Generate vocabulary from training data if SubwordTextEncoder configured
-    self.info.features["text"].maybe_build_from_corpus(
-        self._vocab_text_gen(extracted_dirs[tfds.Split.TRAIN]))
-
-    return [
-        tfds.core.SplitGenerator(
-            name=tfds.Split.TRAIN,
-            num_shards=100,
-            gen_kwargs={
-                "dirs": extracted_dirs[tfds.Split.TRAIN],
-            }),
-        tfds.core.SplitGenerator(
-            name=tfds.Split.VALIDATION,
-            num_shards=10,
-            gen_kwargs={
-                "dirs": extracted_dirs[tfds.Split.VALIDATION],
-            }),
-        tfds.core.SplitGenerator(
-            name=tfds.Split.TEST,
-            num_shards=10,
-            gen_kwargs={
-                "dirs": extracted_dirs[tfds.Split.TEST],
-            }),
+    extracted_dirs = dl_manager.download_and_extract(_DL_URLS)
+    # Generate vocabulary from training data if SubwordTextEncoder configured.
+    all_train_dirs = [
+        v for k, v in extracted_dirs.items() if k.startswith("train")
     ]
-
-  def _generate_examples(self, dirs):
-    for directory in dirs:
-      for example in _walk_librispeech_dir(directory):
-        record = {
-            "speech": example.audio_file,
-            "text": example.transcript,
-            "speaker_id": example.speaker_id,
-            "chapter_id": example.chapter_id,
-        }
-        yield "%s/%s" % (example.speaker_id, example.chapter_id), record
-
-
-LibrispeechExample = collections.namedtuple(
-    "_LibrispeechExample",
-    ["speaker_id", "chapter_id", "audio_file", "transcript"])
-
-
-def _walk_librispeech_dir(directory):
-  """Walk a Librispeech directory and yield examples."""
-  directory = os.path.join(directory, "LibriSpeech")
-  for path, _, files in tf.io.gfile.walk(directory):
-    if not files:
-      continue
-
-    transcript_file = [f for f in files if f.endswith(".txt")]
-    if not transcript_file:
-      continue
-    assert len(transcript_file) == 1
-    transcript_file, = transcript_file
-    transcripts = {}
+    self.info.features["text"].maybe_build_from_corpus(
+        self._vocab_text_gen(all_train_dirs))
+
+    splits = [tfds.core.SplitGenerator(name=k, gen_kwargs={"directory": v})
+              for k, v in extracted_dirs.items()]
+    return splits
+
+  def _build_pcollection(self, pipeline, directory):
+    """Generates examples as dicts."""
+    beam = tfds.core.lazy_imports.apache_beam
+    return (pipeline
+            | beam.Create([directory])
+            | beam.FlatMap(_generate_librispeech_examples)
+            | beam.Reshuffle())
+
+
+def _generate_librispeech_examples(directory):
+  """Generate examples from a Librispeech directory."""
+  transcripts_glob = os.path.join(directory, "LibriSpeech", "*/*/*/*.txt")
+  for transcript_file in tf.io.gfile.glob(transcripts_glob):
+    path = os.path.dirname(transcript_file)
     with tf.io.gfile.GFile(os.path.join(path, transcript_file)) as f:
       for line in f:
         line = line.strip()
         key, transcript = line.split(" ", 1)
-        transcripts[key] = transcript
-    audio_files = [f for f in files if not f.endswith(".txt")]
-    for audio_file in audio_files:
-      assert audio_file.endswith(".flac")
-      key = audio_file[:-len(".flac")]
-      transcript = transcripts[key]
-      speaker_id, chapter_id = [int(el) for el in key.split("-")[:2]]
-      yield LibrispeechExample(
-          speaker_id=speaker_id,
-          chapter_id=chapter_id,
-          audio_file=os.path.join(path, audio_file),
-          transcript=transcript)
+        audio_file = "%s.flac" % key
+        speaker_id, chapter_id = [int(el) for el in key.split("-")[:2]]
+        example = {
+            "id": key,
+            "speaker_id": speaker_id,
+            "chapter_id": chapter_id,
+            "speech": os.path.join(path, audio_file),
+            "text": transcript
+        }
+        yield key, example
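
With Librispeech now a tfds.core.BeamBasedBuilder, generation runs through an Apache Beam pipeline. A minimal sketch of preparing the dataset locally (not part of this commit; assumes the `beam_options` field of `tfds.download.DownloadConfig` from the TFDS Beam docs, and illustrative DirectRunner flags):

import apache_beam as beam
import tensorflow_datasets as tfds

# Beam-based builders need pipeline options to generate examples; the
# DirectRunner suffices locally, though train-other-500 is ~30 GB.
dl_config = tfds.download.DownloadConfig(
    beam_options=beam.options.pipeline_options.PipelineOptions(
        flags=["--direct_num_workers=4"]))
builder = tfds.builder("librispeech/plain_text")
builder.download_and_prepare(download_config=dl_config)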

tensorflow_datasets/audio/librispeech_test.py

Lines changed: 16 additions & 34 deletions
@@ -21,48 +21,30 @@

 from tensorflow_datasets import testing
 from tensorflow_datasets.audio import librispeech
-import tensorflow_datasets.public_api as tfds


-class LibrispeechTest100(testing.DatasetBuilderTestCase):
+class LibrispeechTest(testing.DatasetBuilderTestCase):
   DATASET_CLASS = librispeech.Librispeech
-  BUILDER_CONFIG_NAMES_TO_TEST = ["clean100_plain_text", "clean100_bytes"]
+  BUILDER_CONFIG_NAMES_TO_TEST = ["plain_text", "subwords8k"]
   SPLITS = {
-      "train": 2,
-      "validation": 2,
-      "test": 2,
+      "train_clean100": 2,
+      "train_clean360": 2,
+      "train_other500": 2,
+      "test_clean": 2,
+      "test_other": 2,
+      "dev_clean": 2,
+      "dev_other": 2,
   }
-
   DL_EXTRACT_RESULT = {
-      tfds.Split.TRAIN: ["train-clean-100"],
-      tfds.Split.TEST: ["test-clean"],
-      tfds.Split.VALIDATION: ["dev-clean"],
+      "train_clean100": "train-clean-100",
+      "train_clean360": "train-clean-360",
+      "train_other500": "train-other-500",
+      "test_clean": "test-clean",
+      "test_other": "test-other",
+      "dev_clean": "dev-clean",
+      "dev_other": "dev-other",
   }


-class LibrispeechTest100S3(LibrispeechTest100):
-  VERSION = "experimental_latest"
-
-
-class LibrispeechTest360(testing.DatasetBuilderTestCase):
-  DATASET_CLASS = librispeech.Librispeech
-  BUILDER_CONFIG_NAMES_TO_TEST = ["clean360_plain_text"]
-  SPLITS = {
-      "train": 4,
-      "validation": 2,
-      "test": 2,
-  }
-
-  DL_EXTRACT_RESULT = {
-      tfds.Split.TRAIN: ["train-clean-100", "train-clean-360"],
-      tfds.Split.TEST: ["test-clean"],
-      tfds.Split.VALIDATION: ["dev-clean"],
-  }
-
-
-class LibrispeechTest360S3(LibrispeechTest360):
-  VERSION = "experimental_latest"
-
-
 if __name__ == "__main__":
   testing.test_main()
tensorflow_datasets/url_checksums/librispeech.txt

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
+http://www.openslr.org/resources/12/dev-clean.tar.gz 337926286 76f87d090650617fca0cac8f88b9416e0ebf80350acb97b343a85fa903728ab3
+http://www.openslr.org/resources/12/dev-other.tar.gz 314305928 12661c48e8c3fe1de2c1caa4c3e135193bfb1811584f11f569dd12645aa84365
+http://www.openslr.org/resources/12/test-clean.tar.gz 346663984 39fde525e59672dc6d1551919b1478f724438a95aa55f874b576be21967e6c23
+http://www.openslr.org/resources/12/test-other.tar.gz 328757843 d09c181bba5cf717b3dee7d4d592af11a3ee3a09e08ae025c5506f6ebe961c29
+http://www.openslr.org/resources/12/train-clean-100.tar.gz 6387309499 d4ddd1d5a6ab303066f14971d768ee43278a5f2a0aa43dc716b0e64ecbbbf6e2
+http://www.openslr.org/resources/12/train-clean-360.tar.gz 23049477885 146a56496217e96c14334a160df97fffedd6e0a04e66b9c5af0d40be3c792ecf
+http://www.openslr.org/resources/12/train-other-500.tar.gz 30593501606 ddb22f27f96ec163645d53215559df6aa36515f26e01dd70798188350adcb6d2
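
Each line of the new checksums file records a download URL, the archive size in bytes, and its SHA-256 hex digest, which the tfds download manager uses to validate downloads. A minimal sketch of that validation (hypothetical helper and local path; shown only to illustrate the file format):

import hashlib
import os

def verify_archive(path, expected_size, expected_sha256):
  """Checks a downloaded archive against a checksums entry."""
  if os.path.getsize(path) != expected_size:
    raise ValueError("Size mismatch for %s" % path)
  sha256 = hashlib.sha256()
  with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
      sha256.update(chunk)
  if sha256.hexdigest() != expected_sha256:
    raise ValueError("Checksum mismatch for %s" % path)

verify_archive(
    "dev-clean.tar.gz", 337926286,
    "76f87d090650617fca0cac8f88b9416e0ebf80350acb97b343a85fa903728ab3")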
