19
19
from __future__ import division
20
20
from __future__ import print_function
21
21
22
- import collections
23
22
import os
24
23
25
24
import tensorflow .compat .v2 as tf
38
37
"""
39
38
40
39
_DESCRIPTION = """\
41
- LibriSpeech is a corpus of approximately 1000 hours of read English speech of frequency 16 KHz,
40
+ LibriSpeech is a corpus of approximately 1000 hours of read English speech with sampling rate of 16 kHz,
42
41
prepared by Vassil Panayotov with the assistance of Daniel Povey. The data is derived from read
43
- audiobooks from the LibriVox project, and has been carefully segmented and aligned.
42
+ audiobooks from the LibriVox project, and has been carefully segmented and aligned.
44
43
"""
45
44
46
45
_URL = "http://www.openslr.org/12"
54
53
"train_clean360" : _DL_URL + "train-clean-360.tar.gz" ,
55
54
"train_other500" : _DL_URL + "train-other-500.tar.gz" ,
56
55
}
57
- _DATA_OPTIONS = ["clean100" , "clean360" , "all" ]
58
56
59
57
60
- # TODO(tfds): Better support compositional configuration
61
58
class LibrispeechConfig(tfds.core.BuilderConfig):
  """BuilderConfig for Librispeech.

  Derives a default config `name` and `description` from the text-encoder
  configuration when the caller does not supply them.
  """

  @tfds.core.disallow_positional_args
  def __init__(self, text_encoder_config=None, **kwargs):
    """Constructs a LibrispeechConfig.

    Args:
      text_encoder_config: `tfds.features.text.TextEncoderConfig`, configuration
        for the `tfds.features.text.TextEncoder` used for the text feature.
      **kwargs: keyword arguments forwarded to super.
    """
    # Default the config name to the encoder's name ("plain_text" when no
    # encoder is configured) so each encoding gets a distinct config.
    name = kwargs.get("name")
    if name is None:
      name = text_encoder_config.name if text_encoder_config else "plain_text"
    kwargs["name"] = name

    # Default the description to which transcription encoding is used.
    description = kwargs.get("description")
    if description is None:
      if text_encoder_config:
        description = "Transcriptions use the %s" % (
            text_encoder_config.encoder_cls.__name__)
      else:
        description = "Transcriptions are in plain text."
    kwargs["description"] = description

    # Py2-compatible super call (the file still imports from __future__).
    super(LibrispeechConfig, self).__init__(**kwargs)
    self.text_encoder_config = text_encoder_config
127
86
128
87
129
88
def _make_builder_configs():
  """Make built-in Librispeech BuilderConfigs.

  Uses 3 text encodings (plain_text, subwords with 8k vocab, subwords with 32k
  vocab).

  Returns:
    `list<tfds.audio.LibrispeechConfig>`
  """
  text_encoder_configs = [
      None,  # plain_text: no encoder configured.
      # NOTE(review): the middle of this list was elided in the diff view
      # (`@@` hunk); reconstructed from the visible `subwords8k` header and
      # the trailing `vocab_size=2**15` line — confirm against upstream.
      tfds.features.text.TextEncoderConfig(
          name="subwords8k",
          encoder_cls=tfds.features.text.SubwordTextEncoder,
          vocab_size=2**13),
      tfds.features.text.TextEncoderConfig(
          name="subwords32k",
          encoder_cls=tfds.features.text.SubwordTextEncoder,
          vocab_size=2**15),
  ]
  configs = []
  for text_encoder_config in text_encoder_configs:
    config = LibrispeechConfig(
        version=tfds.core.Version("1.1.0"),
        text_encoder_config=text_encoder_config)
    configs.append(config)
  return configs
164
115
165
116
166
- class Librispeech (tfds .core .GeneratorBasedBuilder ):
117
+ class Librispeech (tfds .core .BeamBasedBuilder ):
167
118
"""Librispeech dataset."""
168
119
169
120
BUILDER_CONFIGS = _make_builder_configs ()
170
121
171
- IN_DEVELOPMENT = True
172
-
173
122
def _info (self ):
174
123
return tfds .core .DatasetInfo (
175
124
builder = self ,
@@ -184,87 +133,57 @@ def _info(self):
184
133
tf .int64 ,
185
134
"chapter_id" :
186
135
tf .int64 ,
136
+ "id" :
137
+ tf .string ,
187
138
}),
188
139
supervised_keys = ("speech" , "text" ),
189
140
homepage = _URL ,
190
141
citation = _CITATION ,
191
142
)
192
143
193
144
def _vocab_text_gen (self , dirs ):
194
- for unused_key , example in self ._generate_examples (dirs ):
195
- yield example ["text" ]
145
+ for directory in dirs :
146
+ for _ , example in _generate_librispeech_examples (directory ):
147
+ yield example ["text" ]
196
148
197
149
def _split_generators (self , dl_manager ):
198
- extracted_dirs = dl_manager .download_and_extract (
199
- self .builder_config .download_urls )
200
- # Generate vocabulary from training data if SubwordTextEncoder configured
201
- self .info .features ["text" ].maybe_build_from_corpus (
202
- self ._vocab_text_gen (extracted_dirs [tfds .Split .TRAIN ]))
203
-
204
- return [
205
- tfds .core .SplitGenerator (
206
- name = tfds .Split .TRAIN ,
207
- num_shards = 100 ,
208
- gen_kwargs = {
209
- "dirs" : extracted_dirs [tfds .Split .TRAIN ],
210
- }),
211
- tfds .core .SplitGenerator (
212
- name = tfds .Split .VALIDATION ,
213
- num_shards = 10 ,
214
- gen_kwargs = {
215
- "dirs" : extracted_dirs [tfds .Split .VALIDATION ],
216
- }),
217
- tfds .core .SplitGenerator (
218
- name = tfds .Split .TEST ,
219
- num_shards = 10 ,
220
- gen_kwargs = {
221
- "dirs" : extracted_dirs [tfds .Split .TEST ],
222
- }),
150
+ extracted_dirs = dl_manager .download_and_extract (_DL_URLS )
151
+ # Generate vocabulary from training data if SubwordTextEncoder configured.
152
+ all_train_dirs = [
153
+ v for k , v in extracted_dirs .items () if k .startswith ("train" )
223
154
]
224
-
225
- def _generate_examples (self , dirs ):
226
- for directory in dirs :
227
- for example in _walk_librispeech_dir (directory ):
228
- record = {
229
- "speech" : example .audio_file ,
230
- "text" : example .transcript ,
231
- "speaker_id" : example .speaker_id ,
232
- "chapter_id" : example .chapter_id ,
233
- }
234
- yield "%s/%s" % (example .speaker_id , example .chapter_id ), record
235
-
236
-
237
- LibrispeechExample = collections .namedtuple (
238
- "_LibrispeechExample" ,
239
- ["speaker_id" , "chapter_id" , "audio_file" , "transcript" ])
240
-
241
-
242
- def _walk_librispeech_dir (directory ):
243
- """Walk a Librispeech directory and yield examples."""
244
- directory = os .path .join (directory , "LibriSpeech" )
245
- for path , _ , files in tf .io .gfile .walk (directory ):
246
- if not files :
247
- continue
248
-
249
- transcript_file = [f for f in files if f .endswith (".txt" )]
250
- if not transcript_file :
251
- continue
252
- assert len (transcript_file ) == 1
253
- transcript_file , = transcript_file
254
- transcripts = {}
155
+ self .info .features ["text" ].maybe_build_from_corpus (
156
+ self ._vocab_text_gen (all_train_dirs ))
157
+
158
+ splits = [tfds .core .SplitGenerator (name = k , gen_kwargs = {"directory" : v })
159
+ for k , v in extracted_dirs .items ()]
160
+ return splits
161
+
162
+ def _build_pcollection (self , pipeline , directory ):
163
+ """Generates examples as dicts."""
164
+ beam = tfds .core .lazy_imports .apache_beam
165
+ return (pipeline
166
+ | beam .Create ([directory ])
167
+ | beam .FlatMap (_generate_librispeech_examples )
168
+ | beam .Reshuffle ())
169
+
170
+
171
def _generate_librispeech_examples(directory):
  """Generate (key, example) pairs from an extracted Librispeech directory.

  Args:
    directory: `str`, path to an extracted archive containing a top-level
      `LibriSpeech/` folder.

  Yields:
    `(key, example)` tuples where `key` is the utterance id (e.g.
    "84-121123-0000") and `example` is a dict with the audio path,
    transcript, speaker id, chapter id and utterance id.
  """
  # Transcript .txt files sit three directory levels under LibriSpeech/
  # (presumably split/speaker/chapter — confirm against archive layout).
  transcripts_glob = os.path.join(directory, "LibriSpeech", "*/*/*/*.txt")
  for transcript_file in tf.io.gfile.glob(transcripts_glob):
    path = os.path.dirname(transcript_file)
    # BUG FIX: glob already returns the full path. The previous
    # `os.path.join(path, transcript_file)` duplicated the directory prefix
    # whenever `directory` was a relative path; open the glob result directly.
    with tf.io.gfile.GFile(transcript_file) as f:
      for line in f:
        line = line.strip()
        # Each transcript line is "<utterance-id> <transcript text>".
        key, transcript = line.split(" ", 1)
        # The matching audio file lives next to the transcript file.
        audio_file = "%s.flac" % key
        # Utterance ids are "<speaker>-<chapter>-<utterance>".
        speaker_id, chapter_id = [int(el) for el in key.split("-")[:2]]
        example = {
            "id": key,
            "speaker_id": speaker_id,
            "chapter_id": chapter_id,
            "speech": os.path.join(path, audio_file),
            "text": transcript,
        }
        yield key, example
0 commit comments