Skip to content

Commit 5fa227b

Browse files
added dataprep scripts for voxceleb1 v2
1 parent cc972eb commit 5fa227b

File tree

8 files changed

+324
-55
lines changed

8 files changed

+324
-55
lines changed

egs/voxceleb/v1.1/run_001_prepare_data.sh

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,14 +15,16 @@ config_file=default_config.sh
1515

1616

1717
if [ $stage -le 1 ];then
18-
19-
# Prepare the VoxCeleb2 dataset for training.
20-
local/make_voxceleb2cat.pl $voxceleb2_root dev 16 data/voxceleb2cat_train
21-
# local/make_voxceleb2cat.pl $voxceleb2_root test 16 data/voxceleb2cat_test
22-
# utils/combine_data.sh data/voxceleb2cat data/voxceleb2cat_train data/voxceleb2cat_test
18+
# Prepare the VoxCeleb2 dataset for training.
19+
local/make_voxceleb2cat.pl $voxceleb2_root dev 16 data/voxceleb2cat_train
20+
# local/make_voxceleb2cat.pl $voxceleb2_root test 16 data/voxceleb2cat_test
21+
# utils/combine_data.sh data/voxceleb2cat data/voxceleb2cat_train data/voxceleb2cat_test
2322
fi
2423

2524
if [ $stage -le 2 ];then
26-
# prepare voxceleb1 for test
27-
local/make_voxceleb1_oeh.pl $voxceleb1_root data
25+
# prepare voxceleb1 for test
26+
# This script is for the old version of the dataset
27+
local/make_voxceleb1_oeh.pl $voxceleb1_root data
28+
# Use this for the newer version of voxceleb1:
29+
# local/make_voxceleb1_v2_oeh.pl $voxceleb1_root data
2830
fi

egs/voxceleb/v1/local/make_voxceleb1_o.pl

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,13 @@
55
# 2020 Jesus Villalba
66
#
77
# Usage: make_voxceleb1.pl /export/voxceleb1 data/
8-
# Create trial lists for Voxceleb1 original, Entire (E) and hard (H),
8+
# Create trial lists for Voxceleb1 original,
99
# with cleaned and non-cleaned versions
10+
# Attention:
11+
# - This script is for the old version of the dataset without anonymized speaker-ids
12+
# - This script assumes that the voxceleb1 dataset has all speaker directories
13+
# dumped in the same wav directory, NOT separated dev and test directories
14+
1015

1116
if (@ARGV != 2) {
1217
print STDERR "Usage: $0 <path-to-voxceleb1> <path-to-data-dir>\n";

egs/voxceleb/v1/local/make_voxceleb1_oeh.pl

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,10 @@
77
# Usage: make_voxceleb1.pl /export/voxceleb1 data/
88
# Create trial lists for Voxceleb1 original, Entire (E) and hard (H),
99
# with cleaned and non-cleaned versions
10+
# Attention:
11+
# - This script is for the old version of the dataset without anonymized speaker-ids
12+
# - This script assumes that the voxceleb1 dataset has all speaker directories
13+
# dumped in the same wav directory, NOT separated dev and test directories
1014

1115
if (@ARGV != 2) {
1216
print STDERR "Usage: $0 <path-to-voxceleb1> <path-to-data-dir>\n";

egs/voxceleb/v1/local/make_voxceleb1_v2.pl

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,11 @@
66
#
77
# Apache 2.0
88
# Usage: make_voxceleb1_v2.pl /export/voxceleb1 data/
9-
# This version of the script does NOT remove SITW overlap speakers
10-
# Files from the same video are NOT concatenated into 1 segment
9+
# Attention:
10+
# - This script is for the recent version of the dataset
11+
# - This version of the script does NOT remove SITW overlap speakers
12+
# - Files from the same video are NOT concatenated into 1 segment
13+
# - This script assumes that the voxceleb1 dataset has all speaker directories dumped in the same wav directory, NOT separated dev and test directories
1114

1215
if (@ARGV != 3) {
1316
print STDERR "Usage: $0 <path-to-voxceleb1> fs <path-to-data-dir>\n";
Lines changed: 211 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,211 @@
#!/usr/bin/perl
#
# Copyright 2018  Ewald Enzinger
#           2018  David Snyder
#           2020  Jesus Villalba
#
# Usage: <this-script> /export/voxceleb1 data/
# Create trial lists for Voxceleb1 original,
# with cleaned and non-cleaned versions
# Attention:
#  - This script is for the recent version of the dataset
#  - This script assumes that the voxceleb1 dataset has all speaker directories
#    dumped in the same wav directory, NOT separated dev and test directories
#
# Everything is written to <path-to-data-dir>/voxceleb1_test:
#   wav.scp, utt2spk, spk2utt, spk2gender, spk2nation, utt2lang,
#   trials_o, trials_o_clean, trials (union of the trials_* files), utt2model.
# The expected input layout is <path-to-voxceleb1>/wav/<speaker>/<video>/<segment>.wav.

use strict;
use warnings;

if (@ARGV != 2) {
  print STDERR "Usage: $0 <path-to-voxceleb1> <path-to-data-dir>\n";
  print STDERR "e.g. $0 /export/voxceleb1 data/\n";
  exit(1);
}

my ($data_base, $data_dir) = @ARGV;
my $out_dir = "$data_dir/voxceleb1_test";

# List-form system() bypasses the shell, so unusual characters in the path
# cannot be reinterpreted as shell syntax.
if (system("mkdir", "-p", $out_dir) != 0) {
  die "Error making directory $out_dir";
}

# Downloads $url into $path with wget and dies if wget fails.
# "wget -O" creates the target file even when the transfer fails, so the
# partial file is removed; otherwise a later run would find it with -e and
# silently reuse an empty or truncated list.
sub download {
  my ($url, $path) = @_;
  if (system("wget", "-O", $path, $url) != 0) {
    unlink $path;
    die "Error downloading $url to $path";
  }
  return;
}

# Converts a "<speaker>/<video>/<segment>.wav" path, as used by the trial
# lists, into the utterance id "<speaker>-<video>-<segment>".
sub path_to_utt_id {
  my ($path) = @_;
  my ($spkr_id, $rec_id, $segment) = split m{/}, $path;
  $segment =~ s/\.wav$//;
  return "$spkr_id-$rec_id-$segment";
}

# Returns the sorted entries of directory $dir, without "." and "..".
sub list_dir {
  my ($dir) = @_;
  opendir(my $dh, $dir) or die "Cannot open directory: $!";
  my @entries = sort grep { !/^\.{1,2}$/ } readdir($dh);
  closedir $dh;
  return @entries;
}

# Speaker metadata: tab-separated, first four columns are
# speaker id, name, gender and nationality.
# Use the copy shipped with the dataset if present, otherwise download it.
my $meta_url = "https://www.openslr.org/resources/49/vox1_meta.csv";
my $meta_path = "$data_base/vox1_meta.csv";
if (! -e $meta_path) {
  $meta_path = "$out_dir/vox1_meta.csv";
  download($meta_url, $meta_path);
}

open(my $meta_in, "<", $meta_path) or die "Could not open the meta data file $meta_path";
my %spkr2gender = ();
my %spkr2nation = ();
while (my $line = <$meta_in>) {
  chomp $line;
  my ($vox_id, undef, $gender, $nation) = split /\t/, $line;
  next unless defined $nation;  # skip blank or truncated lines
  $spkr2gender{$vox_id} = $gender;
  # The output files are space-separated, so the value must not contain spaces.
  $nation =~ s@ @-@g;
  $spkr2nation{$vox_id} = $nation;
}
close($meta_in) or die;

# Language labels: comma-separated "<speaker>/<video>/<segment>.wav,<lang>,<score>".
my $lid_url = "https://www.robots.ox.ac.uk/~vgg/data/voxceleb/data_workshop_2021/lang_vox1_final.csv";
my $lid_path = "$data_base/lang_vox1_final.csv";
if (! -e $lid_path) {
  $lid_path = "$out_dir/lang_vox1_final.csv";
  download($lid_url, $lid_path);
}
open(my $lid_in, "<", $lid_path) or die "Could not open the language-id file $lid_path";
my %utt2lang = ();
while (my $line = <$lid_in>) {
  chomp $line;
  my ($wav_path, $lang) = split /,/, $line;
  next unless defined $lang;  # skip blank or truncated lines
  my @parts = split m{/}, $wav_path;
  next unless @parts >= 3;    # not a <speaker>/<video>/<segment> path
  $utt2lang{path_to_utt_id($wav_path)} = $lang;
}
close($lid_in) or die;

# Trial lists to fetch from the voxceleb web page:
# [local file name, remote url, output trial file].
# The local names ("very_...") differ from the remote ones ("veri_...");
# they are kept as they are so that lists downloaded by earlier runs are reused.
my $url_base = "http://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta";
my @trial_lists = (
  ["very_test.txt",  "$url_base/veri_test.txt",  "trials_o"],
  ["very_test2.txt", "$url_base/veri_test2.txt", "trials_o_clean"],
);

foreach my $trial_list (@trial_lists) {
  my ($basename, $url, $trial_name) = @$trial_list;
  my $list_path = "$out_dir/$basename";
  my $trial_path = "$out_dir/$trial_name";
  if (! -e $list_path) {
    download($url, $list_path);
  }

  # Rewrite "<0|1> <path1> <path2>" as "<utt1> <utt2> <target|nontarget>".
  open(my $trial_in, "<", $list_path) or die "Could not open the verification trials file $list_path";
  open(my $trial_out, ">", $trial_path) or die "Could not open the output file $trial_path";
  while (my $line = <$trial_in>) {
    chomp $line;
    my ($tar_or_non, $path1, $path2) = split ' ', $line;
    next unless defined $path2;  # skip blank or truncated lines

    my $utt_id1 = path_to_utt_id($path1);
    my $utt_id2 = path_to_utt_id($path2);
    my $target = ($tar_or_non eq "1") ? "target" : "nontarget";
    print $trial_out "$utt_id1 $utt_id2 $target\n";
  }
  close($trial_in) or die;
  close($trial_out) or die;
}

# Speaker directories: real directories or symlinks directly under wav/.
my $wav_dir = "$data_base/wav";
my @spkr_dirs = grep { -d "$wav_dir/$_" || -l "$wav_dir/$_" } list_dir($wav_dir);

open(my $spkr_out, ">", "$out_dir/utt2spk") or die "Could not open the output file $out_dir/utt2spk";
open(my $wav_out, ">", "$out_dir/wav.scp") or die "Could not open the output file $out_dir/wav.scp";
open(my $gender_out, ">", "$out_dir/spk2gender") or die "Could not open the output file $out_dir/spk2gender";
open(my $nat_out, ">", "$out_dir/spk2nation") or die "Could not open the output file $out_dir/spk2nation";
open(my $lang_out, ">", "$out_dir/utt2lang") or die "Could not open the output file $out_dir/utt2lang";

foreach my $spkr_id (@spkr_dirs) {
  my $gender = $spkr2gender{$spkr_id};
  my $nation = $spkr2nation{$spkr_id};
  if (!defined $gender || !defined $nation) {
    # Keep going with empty fields, but make the gap visible.
    warn "No metadata found for speaker $spkr_id in $meta_path\n";
    $gender = "" unless defined $gender;
    $nation = "" unless defined $nation;
  }
  print $gender_out "$spkr_id $gender\n";
  print $nat_out "$spkr_id $nation\n";

  my $spkr_dir = "$wav_dir/$spkr_id";
  my @vid_dirs = grep { -d "$spkr_dir/$_" } list_dir($spkr_dir);
  foreach my $vid_id (@vid_dirs) {
    my $vid_dir = "$spkr_dir/$vid_id";
    foreach my $file (grep { /\.wav$/ } list_dir($vid_dir)) {
      (my $segment = $file) =~ s/\.wav$//;
      my $wav = "$vid_dir/$segment.wav";
      my $utt_id = "$spkr_id-$vid_id-$segment";
      print $wav_out "$utt_id $wav\n";
      print $spkr_out "$utt_id $spkr_id\n";
      # Utterances without a language label are marked explicitly.
      my $lang = exists $utt2lang{$utt_id} ? $utt2lang{$utt_id} : "N/A";
      print $lang_out "$utt_id $lang\n";
    }
  }
}

close($spkr_out) or die;
close($wav_out) or die;
close($lang_out) or die;
close($gender_out) or die;
close($nat_out) or die;

# "trials" is the union of all trials_* files written above.
if (system(
  "cat $out_dir/trials_* | sort -u > $out_dir/trials") != 0) {
  die "Error creating trials file in directory $out_dir";
}

# Every enrollment side of a trial acts as its own model.
if (system(
  "awk '{ print \$1,\$1 }' $out_dir/trials | sort -u > $out_dir/utt2model") != 0) {
  die "Error creating utt2model file in directory $out_dir";
}

if (system(
  "utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) {
  die "Error creating spk2utt file in directory $out_dir";
}
# The exit status of fix_data_dir.sh is not checked here; the validation
# that follows is what decides whether the directory is usable.
system("env LC_COLLATE=C utils/fix_data_dir.sh $out_dir");
if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) {
  die "Error validating directory $out_dir";
}
0 commit comments

Comments
 (0)