Skip to content

Commit 937c95c

Browse files
Merge pull request #73 from hyperion-ml/lachesis
Added Dataprep scripts for VoxCeleb1 v2
2 parents a62157c + 5fa227b commit 937c95c

11 files changed

+1014
-56
lines changed

egs/voxceleb/v1.1/run_001_prepare_data.sh

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,14 +15,16 @@ config_file=default_config.sh
1515

1616

1717
if [ $stage -le 1 ];then
18-
19-
# Prepare the VoxCeleb2 dataset for training.
20-
local/make_voxceleb2cat.pl $voxceleb2_root dev 16 data/voxceleb2cat_train
21-
# local/make_voxceleb2cat.pl $voxceleb2_root test 16 data/voxceleb2cat_test
22-
# utils/combine_data.sh data/voxceleb2cat data/voxceleb2cat_train data/voxceleb2cat_test
18+
# Prepare the VoxCeleb2 dataset for training.
19+
local/make_voxceleb2cat.pl $voxceleb2_root dev 16 data/voxceleb2cat_train
20+
# local/make_voxceleb2cat.pl $voxceleb2_root test 16 data/voxceleb2cat_test
21+
# utils/combine_data.sh data/voxceleb2cat data/voxceleb2cat_train data/voxceleb2cat_test
2322
fi
2423

2524
if [ $stage -le 2 ];then
26-
# prepare voxceleb1 for test
27-
local/make_voxceleb1_oeh.pl $voxceleb1_root data
25+
# prepare voxceleb1 for test
26+
# This script is for the old version of the dataset
27+
local/make_voxceleb1_oeh.pl $voxceleb1_root data
28+
# Use this for the newer version of voxceleb1:
29+
# local/make_voxceleb1_v2_oeh.pl $voxceleb1_root data
2830
fi

egs/voxceleb/v1/local/make_voxceleb1_o.pl

Lines changed: 61 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,13 @@
55
# 2020 Jesus Villalba
66
#
77
# Usage: make_voxceleb1_o.pl /export/voxceleb1 data/
8-
# Create trial lists for Voxceleb1 original, Entire (E) and hard (H),
8+
# Create trial lists for Voxceleb1 original,
99
# with cleaned and non-cleaned versions
10+
# Attention:
11+
# - This script is for the old version of the dataset without anonymized speaker-ids
12+
# - This script assumes that the voxceleb1 dataset has all speaker directories
13+
# dumped in the same wav directory, NOT split into separate dev and test directories
14+
1015

1116
if (@ARGV != 2) {
1217
print STDERR "Usage: $0 <path-to-voxceleb1> <path-to-data-dir>\n";
@@ -26,18 +31,47 @@
2631
my @trials_url = ("$url_base/veri_test.txt", "$url_base/veri_test2.txt");
2732
my @trials = ("trials_o", "trials_o_clean");
2833

29-
open(META_IN, "<", "$data_base/vox1_meta.csv") or die "Could not open the meta data file $data_base/vox1_meta.csv";
34+
my $meta_url = "https://www.openslr.org/resources/49/vox1_meta.csv";
35+
my $meta_path = "$data_base/vox1_meta.csv";
36+
if (! -e "$meta_path") {
37+
$meta_path = "$out_dir/vox1_meta.csv";
38+
system("wget -O $meta_path $meta_url");
39+
}
40+
41+
open(META_IN, "<", "$meta_path") or die "Could not open the meta data file $meta_path";
3042
my %id2spkr = ();
43+
my %spkr2gender = ();
44+
my %spkr2nation = ();
3145
while (<META_IN>) {
32-
chomp;
33-
my ($vox_id, $spkr_id, $gender, $nation, $set) = split;
34-
$id2spkr{$vox_id} = $spkr_id;
35-
46+
chomp;
47+
my ($vox_id, $spkr_id, $gender, $nation, $set) = split "\t";
48+
$id2spkr{$vox_id} = $spkr_id;
49+
$spkr2gender{$spkr_id} = $gender;
50+
$nation =~ s@ @-@g;
51+
$spkr2nation{$spkr_id} = $nation;
3652
}
3753
close(META_IN) or die;
3854

55+
my $lid_url = "https://www.robots.ox.ac.uk/~vgg/data/voxceleb/data_workshop_2021/lang_vox1_final.csv";
56+
my $lid_path = "$data_base/lang_vox1_final.csv";
57+
if (! -e "$lid_path") {
58+
$lid_path = "$out_dir/lang_vox1_final.csv";
59+
system("wget -O $lid_path $lid_url");
60+
}
61+
open(LID_IN, "<", "$lid_path") or die "Could not open the output file $lid_path";
62+
my %utt2lang = ();
63+
while (<LID_IN>) {
64+
chomp;
65+
my ($utt_id, $lang, $score) = split ',';
66+
my ($vox_id, $vid_id, $file_id) = split '/', $utt_id;
67+
my $spkr_id = $id2spkr{$vox_id};
68+
my $utt_id = "$spkr_id-$vid_id-00$file_id";
69+
$utt_id =~ s@\.wav$@@;
70+
$utt2lang{$utt_id} = $lang;
71+
}
72+
close(LID_IN) or die;
73+
3974
#download trials from voxceleb web page
40-
my %valid_utts = ();
4175
for($i = 0; $i <= $#trials; $i++) {
4276

4377
my $file_i = "$out_dir/$trials_basename[$i]";
@@ -70,8 +104,6 @@
70104
$target = "target";
71105
}
72106
print TRIAL_OUT "$utt_id1 $utt_id2 $target\n";
73-
$valid_utts{$utt_id1} = 1;
74-
$valid_utts{$utt_id2} = 1;
75107
}
76108

77109
close(TRIAL_IN) or die;
@@ -84,8 +116,11 @@
84116
my @spkr_dirs = grep {-d "$data_base/voxceleb1_wav/$_" && ! /^\.{1,2}$/} readdir($dh);
85117
closedir $dh;
86118

87-
open(SPKR_TEST, ">", "$out_dir/utt2spk") or die "Could not open the output file $out_dir/utt2spk";
88-
open(WAV_TEST, ">", "$out_dir/wav.scp") or die "Could not open the output file $out_dir/wav.scp";
119+
open(SPKR, ">", "$out_dir/utt2spk") or die "Could not open the output file $out_dir/utt2spk";
120+
open(WAV, ">", "$out_dir/wav.scp") or die "Could not open the output file $out_dir/wav.scp";
121+
open(GENDER, ">", "$out_dir/spk2gender") or die "Could not open the output file $out_dir/spk2gender";
122+
open(NAT, ">", "$out_dir/spk2nation") or die "Could not open the output file $out_dir/spk2nation";
123+
open(LANG, ">", "$out_dir/utt2lang") or die "Could not open the output file $out_dir/utt2lang";
89124

90125
foreach (@spkr_dirs) {
91126
my $spkr_id = $_;
@@ -95,6 +130,9 @@
95130
if (exists $id2spkr{$spkr_id}) {
96131
$new_spkr_id = $id2spkr{$spkr_id};
97132
}
133+
print GENDER "$new_spkr_id $spkr2gender{$new_spkr_id}\n";
134+
print NAT "$new_spkr_id $spkr2nation{$new_spkr_id}\n";
135+
98136
opendir my $dh, "$data_base/voxceleb1_wav/$spkr_id/" or die "Cannot open directory: $!";
99137
my @files = map{s/\.[^.]+$//;$_}grep {/\.wav$/} readdir($dh);
100138
closedir $dh;
@@ -104,15 +142,22 @@
104142
my $segment = substr($filename, 12, 7);
105143
my $wav = "$data_base/voxceleb1_wav/$spkr_id/$filename.wav";
106144
my $utt_id = "$new_spkr_id-$rec_id-$segment";
107-
if (exists $valid_utts{$utt_id}) {
108-
print WAV_TEST "$utt_id", " $wav", "\n";
109-
print SPKR_TEST "$utt_id", " $new_spkr_id", "\n";
145+
print WAV "$utt_id", " $wav", "\n";
146+
print SPKR "$utt_id", " $new_spkr_id", "\n";
147+
if (exists $utt2lang{$utt_id}) {
148+
print LANG "$utt_id", " $utt2lang{$utt_id}", "\n";
149+
}
150+
else {
151+
print LANG "$utt_id N/A\n";
110152
}
111153
}
112154
}
113155

114-
close(SPKR_TEST) or die;
115-
close(WAV_TEST) or die;
156+
close(SPKR) or die;
157+
close(WAV) or die;
158+
close(LANG) or die;
159+
close(GENDER) or die;
160+
close(NAT) or die;
116161

117162
if (system(
118163
"cat $out_dir/trials_* | sort -u > $out_dir/trials") != 0) {

egs/voxceleb/v1/local/make_voxceleb1_oeh.pl

Lines changed: 60 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,10 @@
77
# Usage: make_voxceleb1_oeh.pl /export/voxceleb1 data/
88
# Create trial lists for Voxceleb1 original, Entire (E) and hard (H),
99
# with cleaned and non-cleaned versions
10+
# Attention:
11+
# - This script is for the old version of the dataset without anonymized speaker-ids
12+
# - This script assumes that the voxceleb1 dataset has all speaker directories
13+
# dumped in the same wav directory, NOT split into separate dev and test directories
1014

1115
if (@ARGV != 2) {
1216
print STDERR "Usage: $0 <path-to-voxceleb1> <path-to-data-dir>\n";
@@ -26,16 +30,46 @@
2630
my @trials_url = ("$url_base/veri_test.txt", "$url_base/veri_test2.txt", "$url_base/list_test_hard.txt", "$url_base/list_test_hard2.txt", "$url_base/list_test_all.txt", "$url_base/list_test_all2.txt");
2731
my @trials = ("trials_o", "trials_o_clean", "trials_h", "trials_h_clean", "trials_e", "trials_e_clean");
2832

29-
open(META_IN, "<", "$data_base/vox1_meta.csv") or die "Could not open the meta data file $data_base/vox1_meta.csv";
33+
my $meta_url = "https://www.openslr.org/resources/49/vox1_meta.csv";
34+
my $meta_path = "$data_base/vox1_meta.csv";
35+
if (! -e "$meta_path") {
36+
$meta_path = "$out_dir/vox1_meta.csv";
37+
system("wget -O $meta_path $meta_url");
38+
}
39+
40+
open(META_IN, "<", "$meta_path") or die "Could not open the meta data file $meta_path";
3041
my %id2spkr = ();
42+
my %spkr2gender = ();
43+
my %spkr2nation = ();
3144
while (<META_IN>) {
32-
chomp;
33-
my ($vox_id, $spkr_id, $gender, $nation, $set) = split;
34-
$id2spkr{$vox_id} = $spkr_id;
35-
45+
chomp;
46+
my ($vox_id, $spkr_id, $gender, $nation, $set) = split "\t";
47+
$id2spkr{$vox_id} = $spkr_id;
48+
$spkr2gender{$spkr_id} = $gender;
49+
$nation =~ s@ @-@g;
50+
$spkr2nation{$spkr_id} = $nation;
3651
}
3752
close(META_IN) or die;
3853

54+
my $lid_url = "https://www.robots.ox.ac.uk/~vgg/data/voxceleb/data_workshop_2021/lang_vox1_final.csv";
55+
my $lid_path = "$data_base/lang_vox1_final.csv";
56+
if (! -e "$lid_path") {
57+
$lid_path = "$out_dir/lang_vox1_final.csv";
58+
system("wget -O $lid_path $lid_url");
59+
}
60+
open(LID_IN, "<", "$lid_path") or die "Could not open the output file $lid_path";
61+
my %utt2lang = ();
62+
while (<LID_IN>) {
63+
chomp;
64+
my ($utt_id, $lang, $score) = split ',';
65+
my ($vox_id, $vid_id, $file_id) = split '/', $utt_id;
66+
my $spkr_id = $id2spkr{$vox_id};
67+
my $utt_id = "$spkr_id-$vid_id-00$file_id";
68+
$utt_id =~ s@\.wav$@@;
69+
$utt2lang{$utt_id} = $lang;
70+
}
71+
close(LID_IN) or die;
72+
3973
#download trials from voxceleb web page
4074
for($i = 0; $i <= $#trials; $i++) {
4175

@@ -81,8 +115,11 @@
81115
my @spkr_dirs = grep {-d "$data_base/voxceleb1_wav/$_" && ! /^\.{1,2}$/} readdir($dh);
82116
closedir $dh;
83117

84-
open(SPKR_TEST, ">", "$out_dir/utt2spk") or die "Could not open the output file $out_dir/utt2spk";
85-
open(WAV_TEST, ">", "$out_dir/wav.scp") or die "Could not open the output file $out_dir/wav.scp";
118+
open(SPKR, ">", "$out_dir/utt2spk") or die "Could not open the output file $out_dir/utt2spk";
119+
open(WAV, ">", "$out_dir/wav.scp") or die "Could not open the output file $out_dir/wav.scp";
120+
open(GENDER, ">", "$out_dir/spk2gender") or die "Could not open the output file $out_dir/spk2gender";
121+
open(NAT, ">", "$out_dir/spk2nation") or die "Could not open the output file $out_dir/spk2nation";
122+
open(LANG, ">", "$out_dir/utt2lang") or die "Could not open the output file $out_dir/utt2lang";
86123

87124
foreach (@spkr_dirs) {
88125
my $spkr_id = $_;
@@ -92,6 +129,9 @@
92129
if (exists $id2spkr{$spkr_id}) {
93130
$new_spkr_id = $id2spkr{$spkr_id};
94131
}
132+
print GENDER "$new_spkr_id $spkr2gender{$new_spkr_id}\n";
133+
print NAT "$new_spkr_id $spkr2nation{$new_spkr_id}\n";
134+
95135
opendir my $dh, "$data_base/voxceleb1_wav/$spkr_id/" or die "Cannot open directory: $!";
96136
my @files = map{s/\.[^.]+$//;$_}grep {/\.wav$/} readdir($dh);
97137
closedir $dh;
@@ -101,13 +141,22 @@
101141
my $segment = substr($filename, 12, 7);
102142
my $wav = "$data_base/voxceleb1_wav/$spkr_id/$filename.wav";
103143
my $utt_id = "$new_spkr_id-$rec_id-$segment";
104-
print WAV_TEST "$utt_id", " $wav", "\n";
105-
print SPKR_TEST "$utt_id", " $new_spkr_id", "\n";
144+
print WAV "$utt_id", " $wav", "\n";
145+
print SPKR "$utt_id", " $new_spkr_id", "\n";
146+
if (exists $utt2lang{$utt_id}) {
147+
print LANG "$utt_id", " $utt2lang{$utt_id}", "\n";
148+
}
149+
else {
150+
print LANG "$utt_id N/A\n";
151+
}
106152
}
107153
}
108154

109-
close(SPKR_TEST) or die;
110-
close(WAV_TEST) or die;
155+
close(SPKR) or die;
156+
close(WAV) or die;
157+
close(LANG) or die;
158+
close(GENDER) or die;
159+
close(NAT) or die;
111160

112161
if (system(
113162
"cat $out_dir/trials_* | sort -u > $out_dir/trials") != 0) {

0 commit comments

Comments
 (0)