Skip to content

Commit 41493ec

Browse files
ronw authored and copybara-github committed
Add libritts audio corpus, a variant of librispeech targeted at text-to-speech synthesis applications, to tfds.
The data format is very similar to librispeech, so this change is mostly copy-pasted with only minor modifications. All fake_examples are branched from the librispeech fake_examples with modifications to match LibriTTS formatting: 1. convert audio format from flac to wav 2. include original and normalized transcripts in TSV format. 3. use _ instead of - in filenames 4. append an extra number to each wav file name. PiperOrigin-RevId: 297179707
1 parent 3f77980 commit 41493ec

File tree

39 files changed

+323
-0
lines changed

39 files changed

+323
-0
lines changed

tensorflow_datasets/audio/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,5 +18,6 @@
1818
from tensorflow_datasets.audio.groove import Groove
1919
from tensorflow_datasets.audio.librispeech import Librispeech
2020
from tensorflow_datasets.audio.librispeech import LibrispeechConfig
21+
from tensorflow_datasets.audio.libritts import Libritts
2122
from tensorflow_datasets.audio.nsynth import Nsynth
2223
from tensorflow_datasets.audio.speech_commands import SpeechCommands

tensorflow_datasets/audio/libritts.py

Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
# coding=utf-8
2+
# Copyright 2020 The TensorFlow Datasets Authors.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
"""LibriTTS dataset."""
17+
18+
from __future__ import absolute_import
19+
from __future__ import division
20+
from __future__ import print_function
21+
22+
import os
23+
24+
import tensorflow.compat.v2 as tf
25+
26+
import tensorflow_datasets.public_api as tfds
27+
28+
_CITATION = """\
29+
@inproceedings{zen2019libritts,
30+
title = {LibriTTS: A Corpus Derived from LibriSpeech for Text-to-Speech},
31+
author = {H. Zen and V. Dang and R. Clark and Y. Zhang and R. J. Weiss and Y. Jia and Z. Chen and Y. Wu},
32+
booktitle = {Proc. Interspeech},
33+
month = sep,
34+
year = {2019},
35+
doi = {10.21437/Interspeech.2019-2441},
36+
}
37+
"""
38+
39+
_DESCRIPTION = """\
40+
LibriTTS is a multi-speaker English corpus of approximately 585 hours of read
41+
English speech at 24kHz sampling rate, prepared by Heiga Zen with the assistance
42+
of Google Speech and Google Brain team members. The LibriTTS corpus is designed
43+
for TTS research. It is derived from the original materials (mp3 audio files
44+
from LibriVox and text files from Project Gutenberg) of the LibriSpeech corpus.
45+
The main differences from the LibriSpeech corpus are listed below:
46+
47+
1. The audio files are at 24kHz sampling rate.
48+
2. The speech is split at sentence breaks.
49+
3. Both original and normalized texts are included.
50+
4. Contextual information (e.g., neighbouring sentences) can be extracted.
51+
5. Utterances with significant background noise are excluded.
52+
"""
53+
54+
# Homepage of the corpus and base location of its downloadable archives.
_URL = "http://www.openslr.org/60"
_DL_URL = "http://www.openslr.org/resources/60/"

# Maps each download key to the URL of its `.tar.gz` archive. The pairs are
# listed in the order in which the keys must appear in the resulting dict.
_DL_URLS = {
    key: "".join((_DL_URL, archive_stem, ".tar.gz"))
    for key, archive_stem in (
        ("dev_clean", "dev-clean"),
        ("dev_other", "dev-other"),
        ("test_clean", "test-clean"),
        ("test_other", "test-other"),
        ("train_clean100", "train-clean-100"),
        ("train_clean360", "train-clean-360"),
        ("train_other500", "train-other-500"),
    )
}
65+
66+
67+
class Libritts(tfds.core.BeamBasedBuilder):
  """Dataset builder for the LibriTTS corpus, generated through Beam."""

  VERSION = tfds.core.Version("1.0.0")

  def _info(self):
    """Returns the `DatasetInfo` with features, keys and static metadata."""
    feature_spec = tfds.features.FeaturesDict({
        "speech": tfds.features.Audio(),
        "text_original": tfds.features.Text(),
        "text_normalized": tfds.features.Text(),
        "speaker_id": tf.int64,
        "chapter_id": tf.int64,
        "id": tf.string,
    })
    extra_metadata = tfds.core.MetadataDict(sample_rate=24000)
    return tfds.core.DatasetInfo(
        builder=self,
        description=_DESCRIPTION,
        features=feature_spec,
        supervised_keys=("text_normalized", "speech"),
        homepage=_URL,
        citation=_CITATION,
        metadata=extra_metadata,
    )

  def _populate_metadata(self, dirs):
    """Reads `speakers.tsv` and stores it under `metadata["speakers"]`.

    Args:
      dirs: Dict whose values are extracted archive directories.
    """
    # Every extracted directory carries the same speaker table, so the first
    # one is as good as any other.
    first_dir = list(dirs.values())[0]
    speakers_tsv = os.path.join(first_dir, "LibriTTS/speakers.tsv")

    speakers = {}
    with tf.io.gfile.GFile(speakers_tsv) as tsv:
      for row_number, row in enumerate(tsv):
        if row_number > 0:  # Row 0 only holds the column names.
          columns = row.strip().split("\t")
          if len(columns) == 3:
            # The trailing name column is absent on some rows; use "".
            columns.append("")
          reader, gender, subset, name = columns
          speakers[int(reader)] = dict(
              gender=gender, subset=subset, name=name)
    self.info.metadata["speakers"] = speakers

  def _split_generators(self, dl_manager):
    """Downloads all archives and returns one `SplitGenerator` per key."""
    extracted_dirs = dl_manager.download_and_extract(_DL_URLS)
    self._populate_metadata(extracted_dirs)

    generators = []
    for split_name, split_dir in extracted_dirs.items():
      generators.append(
          tfds.core.SplitGenerator(
              name=split_name, gen_kwargs={"directory": split_dir}))
    return generators

  def _build_pcollection(self, pipeline, directory):
    """Builds the PCollection of `(key, example)` pairs for `directory`."""
    beam = tfds.core.lazy_imports.apache_beam
    roots = pipeline | beam.Create([directory])
    examples = roots | beam.FlatMap(_generate_libritts_examples)
    return examples | beam.Reshuffle()
127+
128+
129+
def _generate_libritts_examples(directory):
  """Generate examples from a LibriTTS directory.

  Args:
    directory: Path of an extracted archive containing a `LibriTTS` folder.

  Yields:
    `(key, example)` tuples. `key` is the first tab-separated column of a
    `*.trans.tsv` line and `example` is the feature dict built from that line;
    `"speech"` holds the path of the matching `.wav` file next to the
    transcript.

  Raises:
    ValueError: If a non-empty transcript line does not have exactly three
      tab-separated fields, or if the key does not start with two integer
      components separated by `_`.
  """
  transcripts_glob = os.path.join(directory, "LibriTTS", "*/*/*/*.trans.tsv")
  for transcript_file in tf.io.gfile.glob(transcripts_glob):
    path = os.path.dirname(transcript_file)
    # The glob result already contains its directory, so it is opened as-is;
    # joining it onto `path` again would duplicate the prefix whenever
    # `directory` is a relative path.
    with tf.io.gfile.GFile(transcript_file) as f:
      for line in f:
        # Drop the line terminator so it does not end up inside the last
        # field (the normalized text).
        line = line.rstrip("\r\n")
        if not line:
          # Tolerate blank lines, e.g. a trailing empty line.
          continue
        key, text_original, text_normalized = line.split("\t")
        audio_file = "%s.wav" % key
        # Keys start with "<speaker>_<chapter>_"; later parts are ignored.
        speaker_id, chapter_id = [int(el) for el in key.split("_")[:2]]
        example = {
            "speech": os.path.join(path, audio_file),
            "text_normalized": text_normalized,
            "text_original": text_original,
            "speaker_id": speaker_id,
            "chapter_id": chapter_id,
            "id": key,
        }
        yield key, example
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
# coding=utf-8
2+
# Copyright 2020 The TensorFlow Datasets Authors.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
"""Tests for libritts dataset module."""
17+
18+
from __future__ import absolute_import
19+
from __future__ import division
20+
from __future__ import print_function
21+
22+
from tensorflow_datasets import testing
23+
from tensorflow_datasets.audio import libritts
24+
25+
26+
class LibriTTSTest(testing.DatasetBuilderTestCase):
  """Dataset builder test case for `libritts.Libritts`."""

  DATASET_CLASS = libritts.Libritts

  # Every split name is mapped to the value 2.
  SPLITS = dict(
      train_clean100=2,
      train_clean360=2,
      train_other500=2,
      test_clean=2,
      test_other=2,
      dev_clean=2,
      dev_other=2,
  )

  # Same keys as SPLITS; the values are the hyphenated subset names.
  DL_EXTRACT_RESULT = dict(
      train_clean100="train-clean-100",
      train_clean360="train-clean-360",
      train_other500="train-other-500",
      test_clean="test-clean",
      test_other="test-other",
      dev_clean="dev-clean",
      dev_other="dev-other",
  )


if __name__ == "__main__":
  # Run the test case when this module is executed as a script.
  testing.test_main()
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
11_01_0000_000 GO DO YOU HEAR go do you hear
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
12_02_0000_000 FORGOTTEN TOO THE NAME OF GILLIAN THE LOVELY CAPTIVE forgotten too the name of gillian the lovely captive
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
READER GENDER SUBSET NAME
2+
11 F dev-clean Wolverine
3+
12 M dev-clean Hulk
4+
13 F dev-other Zimmer
5+
14 M dev-other Carla
6+
15 F test-clean Groot
7+
16 M test-clean Tony
8+
17 F test-other Anita
9+
18 M test-other John
10+
19 F train-clean-100 Denny
11+
20 M train-clean-100 Sean
12+
21 F train-clean-360 Kristin
13+
22 M train-clean-360 Linton
14+
23 F train-other-500 Annie
15+
24 M train-other-500 Martin
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
13_03_0000_000 GO DO YOU HEAR go do you hear
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
14_04_0000_000 FORGOTTEN TOO THE NAME OF GILLIAN THE LOVELY CAPTIVE forgotten too the name of gillian the lovely captive
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
READER GENDER SUBSET NAME
2+
11 F dev-clean Wolverine
3+
12 M dev-clean Hulk
4+
13 F dev-other Zimmer
5+
14 M dev-other Carla
6+
15 F test-clean Groot
7+
16 M test-clean Tony
8+
17 F test-other Anita
9+
18 M test-other John
10+
19 F train-clean-100 Denny
11+
20 M train-clean-100 Sean
12+
21 F train-clean-360 Kristin
13+
22 M train-clean-360 Linton
14+
23 F train-other-500 Annie
15+
24 M train-other-500 Martin
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
READER GENDER SUBSET NAME
2+
11 F dev-clean Wolverine
3+
12 M dev-clean Hulk
4+
13 F dev-other Zimmer
5+
14 M dev-other Carla
6+
15 F test-clean Groot
7+
16 M test-clean Tony
8+
17 F test-other Anita
9+
18 M test-other John
10+
19 F train-clean-100 Denny
11+
20 M train-clean-100 Sean
12+
21 F train-clean-360 Kristin
13+
22 M train-clean-360 Linton
14+
23 F train-other-500 Annie
15+
24 M train-other-500 Martin
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
15_05_0000_000 HE BEGAN A CONFUSED COMPLAINT AGAINST THE WIZARD WHO HAD VANISHED BEHIND THE CURTAIN ON THE LEFT he began a confused complaint against the wizard who had vanished behind the curtain on the left
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
16_06_0000_000 ALSO A POPULAR CONTRIVANCE WHEREBY LOVE MAKING MAY BE SUSPENDED BUT NOT STOPPED DURING THE PICNIC SEASON also a popular contrivance whereby love making may be suspended but not stopped during the picnic season
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
READER GENDER SUBSET NAME
2+
11 F dev-clean Wolverine
3+
12 M dev-clean Hulk
4+
13 F dev-other Zimmer
5+
14 M dev-other Carla
6+
15 F test-clean Groot
7+
16 M test-clean Tony
8+
17 F test-other Anita
9+
18 M test-other John
10+
19 F train-clean-100 Denny
11+
20 M train-clean-100 Sean
12+
21 F train-clean-360 Kristin
13+
22 M train-clean-360 Linton
14+
23 F train-other-500 Annie
15+
24 M train-other-500 Martin
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
17_07_0000_000 HE BEGAN A CONFUSED COMPLAINT AGAINST THE WIZARD WHO HAD VANISHED BEHIND THE CURTAIN ON THE LEFT he began a confused complaint against the wizard who had vanished behind the curtain on the left
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
18_08_0000_000 ALSO A POPULAR CONTRIVANCE WHEREBY LOVE MAKING MAY BE SUSPENDED BUT NOT STOPPED DURING THE PICNIC SEASON also a popular contrivance whereby love making may be suspended but not stopped during the picnic season
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
READER GENDER SUBSET NAME
2+
11 F dev-clean Wolverine
3+
12 M dev-clean Hulk
4+
13 F dev-other Zimmer
5+
14 M dev-other Carla
6+
15 F test-clean Groot
7+
16 M test-clean Tony
8+
17 F test-other Anita
9+
18 M test-other John
10+
19 F train-clean-100 Denny
11+
20 M train-clean-100 Sean
12+
21 F train-clean-360 Kristin
13+
22 M train-clean-360 Linton
14+
23 F train-other-500 Annie
15+
24 M train-other-500 Martin
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
19_09_0000_000 GO DO YOU HEAR go do you hear
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
20_10_0000_000 FORGOTTEN TOO THE NAME OF GILLIAN THE LOVELY CAPTIVE forgotten too the name of gillian the lovely captive
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
READER GENDER SUBSET NAME
2+
11 F dev-clean Wolverine
3+
12 M dev-clean Hulk
4+
13 F dev-other Zimmer
5+
14 M dev-other Carla
6+
15 F test-clean Groot
7+
16 M test-clean Tony
8+
17 F test-other Anita
9+
18 M test-other John
10+
19 F train-clean-100 Denny
11+
20 M train-clean-100 Sean
12+
21 F train-clean-360 Kristin
13+
22 M train-clean-360 Linton
14+
23 F train-other-500 Annie
15+
24 M train-other-500 Martin
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
21_11_0000_000 GO DO YOU HEAR go do you hear
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
22_12_0000_000 FORGOTTEN TOO THE NAME OF GILLIAN THE LOVELY CAPTIVE forgotten too the name of gillian the lovely captive
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
READER GENDER SUBSET NAME
2+
11 F dev-clean Wolverine
3+
12 M dev-clean Hulk
4+
13 F dev-other Zimmer
5+
14 M dev-other Carla
6+
15 F test-clean Groot
7+
16 M test-clean Tony
8+
17 F test-other Anita
9+
18 M test-other John
10+
19 F train-clean-100 Denny
11+
20 M train-clean-100 Sean
12+
21 F train-clean-360 Kristin
13+
22 M train-clean-360 Linton
14+
23 F train-other-500 Annie
15+
24 M train-other-500 Martin
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
23_13_0000_000 GO DO YOU HEAR go do you hear
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
24_14_0000_000 FORGOTTEN TOO THE NAME OF GILLIAN THE LOVELY CAPTIVE forgotten too the name of gillian the lovely captive
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
http://www.openslr.org/resources/60/dev-clean.tar.gz 1291469655 da0864e1bd26debed35da8a869dd5c04dfc27682921936de7cff9c8a254dbe1a
2+
http://www.openslr.org/resources/60/dev-other.tar.gz 924804676 d413eda26f3a152ac7c9cf3658ef85504dfb1b625296e5fa83727f5186cca79c
3+
http://www.openslr.org/resources/60/test-clean.tar.gz 1230670113 234ea5b25859102a87024a4b9b86641f5b5aaaf1197335c95090cde04fe9a4f5
4+
http://www.openslr.org/resources/60/test-other.tar.gz 964502297 33a5342094f3bba7ccc2e0500b9e72d558f72eb99328ac8debe1d9080402f10d
5+
http://www.openslr.org/resources/60/train-clean-100.tar.gz 7723686890 c5608bf1ef74bb621935382b8399c5cdd51cd3ee47cec51f00f885a64c6c7f6b
6+
http://www.openslr.org/resources/60/train-clean-360.tar.gz 27504073644 ce7cff44dcac46009d18379f37ef36551123a1dc4e5c8e4eb73ae57260de4886
7+
http://www.openslr.org/resources/60/train-other-500.tar.gz 44565031479 e35f7e34deeb2e2bdfe4403d88c8fdd5fbf64865cae41f027a185a6965f0a5df

0 commit comments

Comments
 (0)