@@ -147,6 +147,30 @@ def _vocab_text_gen(self, dirs):
147
147
for _ , example in _generate_librispeech_examples (directory ):
148
148
yield example ["text" ]
149
149
150
+ def _populate_metadata (self , dirs ):
151
+ # All dirs contain the same metadata.
152
+ directory = list (dirs .values ())[0 ]
153
+ self .info .metadata ["speakers" ] = self ._read_metadata_file (
154
+ os .path .join (directory , "LibriSpeech/SPEAKERS.TXT" ),
155
+ ["speaker_id" , "gender" , "subset" , "minutes" , "name" ])
156
+ self .info .metadata ["chapters" ] = self ._read_metadata_file (
157
+ os .path .join (directory , "LibriSpeech/CHAPTERS.TXT" ), [
158
+ "chapter_id" , "speaker_id" , "minutes" , "subset" , "project_id" ,
159
+ "book_id" , "chapter_title" , "project_title"
160
+ ])
161
+
162
def _read_metadata_file(self, path, field_names):
    """Parse a "|"-delimited metadata table into a dict keyed by integer id.

    Lines starting with ";" are treated as comments and skipped, as are
    blank lines. For every remaining line the first column is converted to
    ``int`` and used as the key; the other columns are stripped of
    surrounding whitespace and stored under ``field_names[1:]``.

    Args:
      path: location of the metadata file, opened with
        ``tf.io.gfile.GFile``.
      field_names: column names in file order. The first entry names the
        integer key column and is not repeated inside the per-row dict.

    Returns:
      A dict mapping each integer id to a ``{field_name: value}`` dict.

    Raises:
      ValueError: if the first column of a data line is not an integer.
    """
    # Split one time fewer than there are columns so that any additional
    # "|" characters stay inside the last column instead of pushing its
    # real content past the end of `field_names`, where zip() would drop
    # it. The floor of 1 keeps the key column separable even when fewer
    # than two field names are supplied.
    max_splits = max(len(field_names) - 1, 1)
    value_names = field_names[1:]

    metadata = {}
    with tf.io.gfile.GFile(path) as f:
        for line in f:
            # Skip comment lines and blank lines (the latter would make
            # int() fail on an empty key).
            if line.startswith(";") or not line.strip():
                continue
            fields = line.split("|", max_splits)
            metadata[int(fields[0])] = {
                k: v.strip() for k, v in zip(value_names, fields[1:])
            }
    return metadata
173
+
150
174
def _split_generators (self , dl_manager ):
151
175
extracted_dirs = dl_manager .download_and_extract (_DL_URLS )
152
176
# Generate vocabulary from training data if SubwordTextEncoder configured.
@@ -155,7 +179,7 @@ def _split_generators(self, dl_manager):
155
179
]
156
180
self .info .features ["text" ].maybe_build_from_corpus (
157
181
self ._vocab_text_gen (all_train_dirs ))
158
-
182
+ self . _populate_metadata ( extracted_dirs )
159
183
splits = [tfds .core .SplitGenerator (name = k , gen_kwargs = {"directory" : v })
160
184
for k , v in extracted_dirs .items ()]
161
185
return splits
0 commit comments