Skip to content

Commit 090eb9f

Browse files
ronw, copybara-github
authored and committed
Expose corpus-wide speaker and chapter metadata in dataset metadata.
PiperOrigin-RevId: 296923185
1 parent 3c69b42 commit 090eb9f

File tree

1 file changed

+25
-1
lines changed

1 file changed

+25
-1
lines changed

tensorflow_datasets/audio/librispeech.py

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,30 @@ def _vocab_text_gen(self, dirs):
147147
for _, example in _generate_librispeech_examples(directory):
148148
yield example["text"]
149149

150+
def _populate_metadata(self, dirs):
    """Store the corpus speaker and chapter tables in `self.info.metadata`.

    Args:
      dirs: Dict whose values are extracted directories. Only the first value
        is read.
    """
    # Every extracted directory carries the same metadata files, so reading
    # them from the first one is sufficient.
    root = list(dirs.values())[0]
    tables = (
        ("speakers", "LibriSpeech/SPEAKERS.TXT",
         ["speaker_id", "gender", "subset", "minutes", "name"]),
        ("chapters", "LibriSpeech/CHAPTERS.TXT", [
            "chapter_id", "speaker_id", "minutes", "subset", "project_id",
            "book_id", "chapter_title", "project_title"
        ]),
    )
    for metadata_key, relative_path, columns in tables:
        self.info.metadata[metadata_key] = self._read_metadata_file(
            os.path.join(root, relative_path), columns)
161+
162+
def _read_metadata_file(self, path, field_names):
    """Parse a "|"-delimited metadata file into a dict keyed by integer id.

    Lines starting with ";" are treated as comments and skipped; blank lines
    are skipped as well.

    Args:
      path: Path of the metadata file; opened with `tf.io.gfile.GFile`.
      field_names: Column names in file order. The first column is parsed as
        an integer and used as the key of the result; it is not repeated
        inside the per-record dict.

    Returns:
      Dict mapping the integer value of the first column to a dict of the
      remaining field names to their whitespace-stripped string values.

    Raises:
      ValueError: If the first column of a data line is not an integer.
    """
    # Split one time fewer than there are fields so the last field keeps any
    # "|" characters it contains (e.g. a name like "|CBW|Simon") instead of
    # being cut at its first "|". The floor of 1 keeps the id column separate
    # even when only a single field name is given.
    max_splits = max(len(field_names) - 1, 1)
    metadata = {}
    with tf.io.gfile.GFile(path) as f:
        for line in f:
            # Skip comment lines and blank lines (the latter would otherwise
            # fail in int("") below).
            if line.startswith(";") or not line.strip():
                continue
            fields = line.split("|", max_splits)
            metadata[int(fields[0])] = {
                k: v.strip() for k, v in zip(field_names[1:], fields[1:])
            }
    return metadata
173+
150174
def _split_generators(self, dl_manager):
151175
extracted_dirs = dl_manager.download_and_extract(_DL_URLS)
152176
# Generate vocabulary from training data if SubwordTextEncoder configured.
@@ -155,7 +179,7 @@ def _split_generators(self, dl_manager):
155179
]
156180
self.info.features["text"].maybe_build_from_corpus(
157181
self._vocab_text_gen(all_train_dirs))
158-
182+
self._populate_metadata(extracted_dirs)
159183
splits = [tfds.core.SplitGenerator(name=k, gen_kwargs={"directory": v})
160184
for k, v in extracted_dirs.items()]
161185
return splits

0 commit comments

Comments
 (0)