|
5 | 5 | # 2020 Jesus Villalba
|
6 | 6 | #
|
7 | 7 | # Usage: make_voxceleb1.pl /export/voxceleb1 data/
|
8 |
| -# Create trial lists for Voxceleb1 original, Entire (E) and hard (H), |
| 8 | +# Create trial lists for Voxceleb1 original, |
9 | 9 | # with cleaned and non-cleaned versions
|
| 10 | +# Attention: |
| 11 | +# - This script is for the old version of the dataset without anonymized speaker-ids |
| 12 | +# - This script assumes that the voxceleb1 dataset has all speaker directories |
| 13 | +# dumped in the same wav directory, NOT split into separate dev and test directories |
| 14 | + |
10 | 15 |
|
11 | 16 | if (@ARGV != 2) {
|
12 | 17 | print STDERR "Usage: $0 <path-to-voxceleb1> <path-to-data-dir>\n";
|
|
26 | 31 | my @trials_url = ("$url_base/veri_test.txt", "$url_base/veri_test2.txt");
|
27 | 32 | my @trials = ("trials_o", "trials_o_clean");
|
28 | 33 |
|
29 |
| -open(META_IN, "<", "$data_base/vox1_meta.csv") or die "Could not open the meta data file $data_base/vox1_meta.csv"; |
| 34 | +my $meta_url = "https://www.openslr.org/resources/49/vox1_meta.csv"; |
| 35 | +my $meta_path = "$data_base/vox1_meta.csv"; |
| 36 | +if (! -e "$meta_path") { |
| 37 | + $meta_path = "$out_dir/vox1_meta.csv"; |
| 38 | + system("wget -O $meta_path $meta_url"); |
| 39 | +} |
| 40 | + |
| 41 | +open(META_IN, "<", "$meta_path") or die "Could not open the meta data file $meta_path"; |
30 | 42 | my %id2spkr = ();
|
| 43 | +my %spkr2gender = (); |
| 44 | +my %spkr2nation = (); |
31 | 45 | while (<META_IN>) {
|
32 |
| - chomp; |
33 |
| - my ($vox_id, $spkr_id, $gender, $nation, $set) = split; |
34 |
| - $id2spkr{$vox_id} = $spkr_id; |
35 |
| - |
| 46 | + chomp; |
| 47 | + my ($vox_id, $spkr_id, $gender, $nation, $set) = split "\t"; |
| 48 | + $id2spkr{$vox_id} = $spkr_id; |
| 49 | + $spkr2gender{$spkr_id} = $gender; |
| 50 | + $nation =~ s@ @-@g; |
| 51 | + $spkr2nation{$spkr_id} = $nation; |
36 | 52 | }
|
37 | 53 | close(META_IN) or die;
|
38 | 54 |
|
| 55 | +my $lid_url = "https://www.robots.ox.ac.uk/~vgg/data/voxceleb/data_workshop_2021/lang_vox1_final.csv"; |
| 56 | +my $lid_path = "$data_base/lang_vox1_final.csv"; |
| 57 | +if (! -e "$lid_path") { |
| 58 | + $lid_path = "$out_dir/lang_vox1_final.csv"; |
| 59 | + system("wget -O $lid_path $lid_url"); |
| 60 | +} |
| 61 | +open(LID_IN, "<", "$lid_path") or die "Could not open the output file $lid_path"; |
| 62 | +my %utt2lang = (); |
| 63 | +while (<LID_IN>) { |
| 64 | + chomp; |
| 65 | + my ($utt_id, $lang, $score) = split ','; |
| 66 | + my ($vox_id, $vid_id, $file_id) = split '/', $utt_id; |
| 67 | + my $spkr_id = $id2spkr{$vox_id}; |
| 68 | + my $utt_id = "$spkr_id-$vid_id-00$file_id"; |
| 69 | + $utt_id =~ s@\.wav$@@; |
| 70 | + $utt2lang{$utt_id} = $lang; |
| 71 | +} |
| 72 | +close(LID_IN) or die; |
| 73 | + |
39 | 74 | #download trials from voxceleb web page
|
40 |
| -my %valid_utts = (); |
41 | 75 | for($i = 0; $i <= $#trials; $i++) {
|
42 | 76 |
|
43 | 77 | my $file_i = "$out_dir/$trials_basename[$i]";
|
|
70 | 104 | $target = "target";
|
71 | 105 | }
|
72 | 106 | print TRIAL_OUT "$utt_id1 $utt_id2 $target\n";
|
73 |
| - $valid_utts{$utt_id1} = 1; |
74 |
| - $valid_utts{$utt_id2} = 1; |
75 | 107 | }
|
76 | 108 |
|
77 | 109 | close(TRIAL_IN) or die;
|
|
84 | 116 | my @spkr_dirs = grep {-d "$data_base/voxceleb1_wav/$_" && ! /^\.{1,2}$/} readdir($dh);
|
85 | 117 | closedir $dh;
|
86 | 118 |
|
87 |
| -open(SPKR_TEST, ">", "$out_dir/utt2spk") or die "Could not open the output file $out_dir/utt2spk"; |
88 |
| -open(WAV_TEST, ">", "$out_dir/wav.scp") or die "Could not open the output file $out_dir/wav.scp"; |
| 119 | +open(SPKR, ">", "$out_dir/utt2spk") or die "Could not open the output file $out_dir/utt2spk"; |
| 120 | +open(WAV, ">", "$out_dir/wav.scp") or die "Could not open the output file $out_dir/wav.scp"; |
| 121 | +open(GENDER, ">", "$out_dir/spk2gender") or die "Could not open the output file $out_dir/spk2gender"; |
| 122 | +open(NAT, ">", "$out_dir/spk2nation") or die "Could not open the output file $out_dir/spk2nation"; |
| 123 | +open(LANG, ">", "$out_dir/utt2lang") or die "Could not open the output file $out_dir/utt2lang"; |
89 | 124 |
|
90 | 125 | foreach (@spkr_dirs) {
|
91 | 126 | my $spkr_id = $_;
|
|
95 | 130 | if (exists $id2spkr{$spkr_id}) {
|
96 | 131 | $new_spkr_id = $id2spkr{$spkr_id};
|
97 | 132 | }
|
| 133 | + print GENDER "$new_spkr_id $spkr2gender{$new_spkr_id}\n"; |
| 134 | + print NAT "$new_spkr_id $spkr2nation{$new_spkr_id}\n"; |
| 135 | + |
98 | 136 | opendir my $dh, "$data_base/voxceleb1_wav/$spkr_id/" or die "Cannot open directory: $!";
|
99 | 137 | my @files = map{s/\.[^.]+$//;$_}grep {/\.wav$/} readdir($dh);
|
100 | 138 | closedir $dh;
|
|
104 | 142 | my $segment = substr($filename, 12, 7);
|
105 | 143 | my $wav = "$data_base/voxceleb1_wav/$spkr_id/$filename.wav";
|
106 | 144 | my $utt_id = "$new_spkr_id-$rec_id-$segment";
|
107 |
| - if (exists $valid_utts{$utt_id}) { |
108 |
| - print WAV_TEST "$utt_id", " $wav", "\n"; |
109 |
| - print SPKR_TEST "$utt_id", " $new_spkr_id", "\n"; |
| 145 | + print WAV "$utt_id", " $wav", "\n"; |
| 146 | + print SPKR "$utt_id", " $new_spkr_id", "\n"; |
| 147 | + if (exists $utt2lang{$utt_id}) { |
| 148 | + print LANG "$utt_id", " $utt2lang{$utt_id}", "\n"; |
| 149 | + } |
| 150 | + else { |
| 151 | + print LANG "$utt_id N/A\n"; |
110 | 152 | }
|
111 | 153 | }
|
112 | 154 | }
|
113 | 155 |
|
114 |
| -close(SPKR_TEST) or die; |
115 |
| -close(WAV_TEST) or die; |
| 156 | +close(SPKR) or die; |
| 157 | +close(WAV) or die; |
| 158 | +close(LANG) or die; |
| 159 | +close(GENDER) or die; |
| 160 | +close(NAT) or die; |
116 | 161 |
|
117 | 162 | if (system(
|
118 | 163 | "cat $out_dir/trials_* | sort -u > $out_dir/trials") != 0) {
|
|
0 commit comments