Skip to content

Commit af68c5d

Browse files
TensorFlow Datasets Team, copybara-github
TensorFlow Datasets Team
authored and committed
Add Speech Commands to TFDS
PiperOrigin-RevId: 297141265
1 parent 090eb9f commit af68c5d

File tree

6 files changed

+229
-0
lines changed

6 files changed

+229
-0
lines changed

tensorflow_datasets/audio/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,3 +19,4 @@
1919
from tensorflow_datasets.audio.librispeech import Librispeech
2020
from tensorflow_datasets.audio.librispeech import LibrispeechConfig
2121
from tensorflow_datasets.audio.nsynth import Nsynth
22+
from tensorflow_datasets.audio.speech_commands import SpeechCommands
Lines changed: 187 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,187 @@
1+
# coding=utf-8
2+
# Copyright 2020 The TensorFlow Datasets Authors.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
"""SpeechCommands dataset."""
17+
18+
from __future__ import absolute_import
19+
from __future__ import division
20+
from __future__ import print_function
21+
22+
import os
23+
import numpy as np
24+
25+
from tensorflow_datasets.core import lazy_imports_lib
26+
import tensorflow_datasets.public_api as tfds
27+
28+
# BibTeX citation for the Speech Commands paper (Warden, 2018).
_CITATION = """
@article{speechcommandsv2,
author = {{Warden}, P.},
title = "{Speech Commands: A Dataset for Limited-Vocabulary Speech Recognition}",
journal = {ArXiv e-prints},
archivePrefix = "arXiv",
eprint = {1804.03209},
primaryClass = "cs.CL",
keywords = {Computer Science - Computation and Language, Computer Science - Human-Computer Interaction},
year = 2018,
month = apr,
url = {https://arxiv.org/abs/1804.03209},
}
"""

# Long-form description shown in the dataset catalog.
_DESCRIPTION = """
An audio dataset of spoken words designed to help train and evaluate keyword
spotting systems. Its primary goal is to provide a way to build and test small
models that detect when a single word is spoken, from a set of ten target words,
with as few false positives as possible from background noise or unrelated
speech. Note that in the train and validation set, the label "unknown" is much
more prevalent than the labels of the target words or background noise.
One difference from the release version is the handling of silent segments.
While in the test set the silence segments are regular 1 second files, in the
training they are provided as long segments under "background_noise" folder.
Here we split these background noise into 1 second clips, and also keep one of
the files for the validation set.
"""

# Main archive: train/validation audio plus the split list files.
_DOWNLOAD_PATH = 'http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz'
# Separate archive containing only the official test set.
_TEST_DOWNLOAD_PATH_ = 'http://download.tensorflow.org/data/speech_commands_test_set_v0.02.tar.gz'

# NOTE(review): not referenced anywhere in this file — presumably kept for
# external callers; verify before removing.
_SPLITS = ['train', 'valid', 'test']

# The ten target keywords; any other spoken word is labeled as unknown.
WORDS = ['down', 'go', 'left', 'no', 'off', 'on', 'right', 'stop', 'up', 'yes']
# Special class labels used alongside the target words.
SILENCE = '_silence_'
UNKNOWN = '_unknown_'
# Folder inside the main archive holding long background-noise recordings,
# which are cut into 1-second silence clips during generation.
BACKGROUND_NOISE = '_background_noise_'
# All clips are 16 kHz, so one second of audio is SAMPLE_RATE samples.
SAMPLE_RATE = 16000
67+
68+
69+
class SpeechCommands(tfds.core.GeneratorBasedBuilder):
  """The Speech Commands dataset for keyword detection.

  Labels are the ten target words plus the special silence and unknown
  classes. For the train and validation splits, silence examples are cut
  from the long recordings under the archive's `_background_noise_` folder;
  the test archive ships its own 1-second silence files.
  """

  VERSION = tfds.core.Version('0.0.2')

  def _info(self):
    """Returns the `tfds.core.DatasetInfo` for this builder."""
    return tfds.core.DatasetInfo(
        builder=self,
        description=_DESCRIPTION,
        features=tfds.features.FeaturesDict({
            'audio': tfds.features.Audio(file_format='wav'),
            'label': tfds.features.ClassLabel(names=WORDS + [SILENCE, UNKNOWN])
        }),
        supervised_keys=('audio', 'label'),
        # Homepage of the dataset for documentation.
        homepage='https://arxiv.org/abs/1804.03209',
        citation=_CITATION,
    )

  def _split_generators(self, dl_manager):
    """Returns SplitGenerators for the train, validation and test splits.

    The main archive is iterated once up front to decide which paths belong
    to train vs. validation; the test split comes from a dedicated archive
    and therefore needs no path filtering.

    Args:
      dl_manager: `tfds.download.DownloadManager`.

    Returns:
      A list of `tfds.core.SplitGenerator`.
    """
    dl_path, dl_test_path = dl_manager.download(
        [_DOWNLOAD_PATH, _TEST_DOWNLOAD_PATH_])

    train_paths, validation_paths = self._split_archive(
        dl_manager.iter_archive(dl_path))

    return [
        tfds.core.SplitGenerator(
            name=tfds.Split.TRAIN,
            gen_kwargs={
                'archive': dl_manager.iter_archive(dl_path),
                'file_list': train_paths,
            },
        ),
        tfds.core.SplitGenerator(
            name=tfds.Split.VALIDATION,
            gen_kwargs={
                'archive': dl_manager.iter_archive(dl_path),
                'file_list': validation_paths,
            },
        ),
        tfds.core.SplitGenerator(
            name=tfds.Split.TEST,
            # The test archive contains exactly the test files, so no
            # file_list filter is needed.
            gen_kwargs={
                'archive': dl_manager.iter_archive(dl_test_path),
                'file_list': None,
            },
        ),
    ]

  def _generate_examples(self, archive, file_list):
    """Yields examples from one archive.

    Args:
      archive: iterator of `(path, file_obj)` pairs, as produced by
        `dl_manager.iter_archive`.
      file_list: optional collection of archive paths to keep; when None
        every wav in the archive is emitted (used for the test split).

    Yields:
      `(example_id, {'audio': ..., 'label': ...})` tuples.
    """
    for path, file_obj in archive:
      if file_list is not None and path not in file_list:
        continue
      relpath, wavname = os.path.split(path)
      _, word = os.path.split(relpath)
      example_id = '{}_{}'.format(word, wavname)
      if word in WORDS:
        label = word
      elif word == SILENCE or word == BACKGROUND_NOISE:
        # The main tar file already contains all of the test files, except for
        # the silence ones. In fact it does not contain silence files at all.
        # So for the test set we take the silence files from the test tar file,
        # while for train and validation we build them from the
        # _background_noise_ folder.
        label = SILENCE
      else:
        # Note that in the train and validation there are a lot more _unknown_
        # labels than any of the other ones.
        label = UNKNOWN

      if word == BACKGROUND_NOISE:
        # Special handling of background noise: cut the long recordings into
        # many 1-second clips (starting every half second) labeled as silence.
        audio_samples = np.array(
            lazy_imports_lib.lazy_imports.pydub.AudioSegment.from_file(
                file_obj, format='wav').get_array_of_samples())

        for start in range(0,
                           len(audio_samples) - SAMPLE_RATE, SAMPLE_RATE // 2):
          audio_segment = audio_samples[start:start + SAMPLE_RATE]
          cur_id = '{}_{}'.format(example_id, start)
          example = {'audio': audio_segment, 'label': label}
          yield cur_id, example
      else:
        try:
          example = {
              'audio':
                  np.array(
                      lazy_imports_lib.lazy_imports.pydub.AudioSegment
                      .from_file(file_obj,
                                 format='wav').get_array_of_samples()),
              'label':
                  label,
          }
          yield example_id, example
        except lazy_imports_lib.lazy_imports.pydub.exceptions.CouldntDecodeError:
          # Deliberate best-effort: skip files pydub cannot decode rather
          # than failing the whole generation.
          pass

  def _split_archive(self, train_archive):
    """Partitions the main archive's wav paths into train/validation lists.

    The archive ships `testing_list.txt` and `validation_list.txt`; the train
    split is every wav path that appears in neither list.

    Args:
      train_archive: iterator of `(path, file_obj)` pairs for the main
        archive.

    Returns:
      A `(train_paths, validation_paths)` tuple, where `train_paths` is a set
      and `validation_paths` a list of archive paths.
    """
    train_paths = []
    # Initialize both lists so a malformed archive missing either split-list
    # file yields empty splits instead of raising a NameError below.
    train_test_paths = []
    validation_paths = []
    for path, file_obj in train_archive:
      if 'testing_list.txt' in path:
        train_test_paths = file_obj.read().strip().splitlines()
        train_test_paths = [p.decode('ascii') for p in train_test_paths]
      elif 'validation_list.txt' in path:
        validation_paths = file_obj.read().strip().splitlines()
        validation_paths = [p.decode('ascii') for p in validation_paths]
      elif path.endswith('.wav'):
        train_paths.append(path)

    # Original validation files did include silence - we add them manually here
    validation_paths.append(
        os.path.join(BACKGROUND_NOISE, 'running_tap.wav'))

    # The paths for the train set is just whichever paths that do not exist in
    # either the test or validation splits.
    train_paths = (
        set(train_paths) - set(validation_paths) - set(train_test_paths))

    return train_paths, validation_paths
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
# coding=utf-8
2+
# Copyright 2020 The TensorFlow Datasets Authors.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
"""TODO(speech_commands): Add a description here."""
17+
18+
from __future__ import absolute_import
19+
from __future__ import division
20+
from __future__ import print_function
21+
22+
from tensorflow_datasets import testing
23+
from tensorflow_datasets.audio import speech_commands
24+
25+
26+
class SpeechCommandsTest(testing.DatasetBuilderTestCase):
  """Tests SpeechCommands generation against fake archive data."""

  DATASET_CLASS = speech_commands.SpeechCommands
  SPLITS = {
      "train": 4,  # Number of fake train examples.
      "validation": 3,  # Number of fake validation examples.
      "test": 1,  # Number of fake test examples.
  }

  # Fake archives standing in for the real train and test downloads.
  DL_EXTRACT_RESULT = ["train.tar.gz", "test.tar.gz"]


if __name__ == "__main__":
  testing.test_main()
Binary file not shown.
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
http://download.tensorflow.org/data/speech_commands_test_set_v0.02.tar.gz 112563277 cc2a00c1147c2254e9be3fa0f779d8c17421dc349b86366567a8edfa9acd51df
2+
http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz 2428923189 af14739ee7dc311471de98f5f9d2c9191b18aedfe957f4a6ff791c709868ff58

0 commit comments

Comments
 (0)