#!/usr/bin/perl
#
# Copyright 2018  Ewald Enzinger
#           2018  David Snyder
#           2020  Jesus Villalba
#
# Usage: make_voxceleb1.pl /export/voxceleb1 data/
# Create trial lists for Voxceleb1 original,
# with cleaned and non-cleaned versions
# Attention:
#  - This script is for the recent version of the dataset
#  - This script assumes that the voxceleb1 dataset has all speaker directories
#    dumped in the same wav directory, NOT separated dev and test directories
#
# Everything is written under <path-to-data-dir>/voxceleb1_test:
#   trials_o, trials_o_clean, trials, utt2model,
#   wav.scp, utt2spk, spk2utt, spk2gender, spk2nation, utt2lang
#
# External commands used: mkdir, wget, cat, sort, awk, env, and the helpers
# utils/utt2spk_to_spk2utt.pl, utils/fix_data_dir.sh and
# utils/validate_data_dir.sh, which are looked up relative to the current
# working directory.

use strict;
use warnings;

# Quote a string so that the shell treats it as a single literal word.
sub shell_quote {
    my ($word) = @_;
    $word =~ s/'/'\\''/g;
    return "'$word'";
}

# Download $url to $path unless a non-empty file is already there.
# Returns 1 when $path is usable afterwards and 0 when the download failed.
# The data is first written to "$path.part" because "wget -O" creates its
# output file even when the transfer fails; writing directly to $path would
# leave an empty file behind that later runs would mistake for a valid cache.
sub fetch_if_missing {
    my ($path, $url) = @_;
    return 1 if -s $path;
    my $partial = "$path.part";
    if (system('wget', '-O', $partial, $url) != 0) {
        unlink $partial;
        return 0;
    }
    rename($partial, $path) or die "Could not move $partial to $path: $!";
    return 1;
}

# Return the sorted names found in directory $dir, without "." and "..".
sub list_dir {
    my ($dir) = @_;
    opendir(my $dh, $dir) or die "Cannot open directory $dir: $!";
    my @entries = sort grep { !/^\.{1,2}$/ } readdir($dh);
    closedir($dh);
    return @entries;
}

# Turn "<speaker>/<video>/<segment>.wav" into "<speaker>-<video>-<segment>".
# Returns nothing (undef in scalar context) when the path has fewer than
# three '/'-separated components.
sub path_to_utt_id {
    my ($path) = @_;
    return unless defined $path;
    my ($spkr_id, $vid_id, $segment) = split m{/}, $path;
    return unless defined $segment;
    $segment =~ s/\.wav$//;
    return "$spkr_id-$vid_id-$segment";
}

if (@ARGV != 2) {
    print STDERR "Usage: $0 <path-to-voxceleb1> <path-to-data-dir>\n";
    print STDERR "e.g. $0 /export/voxceleb1 data/\n";
    exit(1);
}

my ($data_base, $data_dir) = @ARGV;
my $out_dir = "$data_dir/voxceleb1_test";

# List-form system() bypasses the shell, so paths with spaces are safe.
system('mkdir', '-p', $out_dir) == 0
    or die "Error making directory $out_dir";

# One entry per trial list: where to download it from, the file name it is
# cached under in $out_dir, and the name of the converted trial file.
# The cache names are kept exactly as earlier runs of this script wrote them.
my $url_base = "http://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta";
my @trial_lists = (
    {
        url   => "$url_base/veri_test.txt",
        cache => "very_test.txt",
        name  => "trials_o",
    },
    {
        url   => "$url_base/veri_test2.txt",
        cache => "very_test2.txt",
        name  => "trials_o_clean",
    },
);

# ---------------------------------------------------------------------------
# Speaker metadata: tab-separated, columns 1, 3 and 4 are used as
# speaker id, gender and nationality.
# ---------------------------------------------------------------------------
my $meta_url  = "https://www.openslr.org/resources/49/vox1_meta.csv";
my $meta_path = "$data_base/vox1_meta.csv";
if (! -e $meta_path) {
    # Not shipped with the dataset: keep a downloaded copy in the output dir.
    $meta_path = "$out_dir/vox1_meta.csv";
    fetch_if_missing($meta_path, $meta_url)
        or die "Could not download $meta_url to $meta_path";
}

my %spkr2gender;
my %spkr2nation;
open(my $meta_in, '<', $meta_path)
    or die "Could not open the meta data file $meta_path: $!";
while (my $line = <$meta_in>) {
    chomp $line;
    my ($vox_id, undef, $gender, $nation) = split /\t/, $line;
    next unless defined $nation;    # blank or truncated line
    $nation =~ s/ /-/g;             # keep the value a single token
    $spkr2gender{$vox_id} = $gender;
    $spkr2nation{$vox_id} = $nation;
}
close($meta_in) or die "Could not close $meta_path: $!";

# ---------------------------------------------------------------------------
# Per-utterance language labels: comma-separated, column 1 is the wav path
# and column 2 the language.  These labels are optional: when they cannot be
# obtained every utterance is written with "N/A".
# ---------------------------------------------------------------------------
my $lid_url  = "https://www.robots.ox.ac.uk/~vgg/data/voxceleb/data_workshop_2021/lang_vox1_final.csv";
my $lid_path = "$data_base/lang_vox1_final.csv";
my $have_lid = 1;
if (! -e $lid_path) {
    $lid_path = "$out_dir/lang_vox1_final.csv";
    $have_lid = fetch_if_missing($lid_path, $lid_url);
    warn "Warning: could not download $lid_url, utt2lang will only contain N/A\n"
        unless $have_lid;
}

my %utt2lang;
if ($have_lid) {
    open(my $lid_in, '<', $lid_path)
        or die "Could not open the language id file $lid_path: $!";
    while (my $line = <$lid_in>) {
        chomp $line;
        my ($wav_path, $lang) = split /,/, $line;
        next unless defined $lang;
        my $utt_id = path_to_utt_id($wav_path);
        next unless defined $utt_id;    # not a <spk>/<video>/<file> path
        $utt2lang{$utt_id} = $lang;
    }
    close($lid_in) or die "Could not close $lid_path: $!";
}

# ---------------------------------------------------------------------------
# Download the trial lists and convert each "<label> <path1> <path2>" line
# into "<utt1> <utt2> target|nontarget".
# ---------------------------------------------------------------------------
for my $list (@trial_lists) {
    my $raw_path   = "$out_dir/$list->{cache}";
    my $trial_path = "$out_dir/$list->{name}";

    fetch_if_missing($raw_path, $list->{url})
        or die "Could not download $list->{url} to $raw_path";

    open(my $trial_in, '<', $raw_path)
        or die "Could not open the verification trials file $raw_path: $!";
    open(my $trial_out, '>', $trial_path)
        or die "Could not open the output file $trial_path: $!";
    while (my $line = <$trial_in>) {
        my ($tar_or_non, $path1, $path2) = split ' ', $line;
        my $utt_id1 = path_to_utt_id($path1);
        my $utt_id2 = path_to_utt_id($path2);
        next unless defined $utt_id1 && defined $utt_id2;   # blank/short line

        my $target = $tar_or_non eq "1" ? "target" : "nontarget";
        print {$trial_out} "$utt_id1 $utt_id2 $target\n";
    }
    close($trial_in)  or die "Could not close $raw_path: $!";
    close($trial_out) or die "Could not write $trial_path: $!";
}

# ---------------------------------------------------------------------------
# Walk <data_base>/wav/<speaker>/<video>/<segment>.wav and write the
# per-utterance and per-speaker files.
# ---------------------------------------------------------------------------
my $wav_dir   = "$data_base/wav";
my @spkr_dirs = grep { -d "$wav_dir/$_" || -l "$wav_dir/$_" } list_dir($wav_dir);

open(my $spkr_out, '>', "$out_dir/utt2spk")
    or die "Could not open the output file $out_dir/utt2spk: $!";
open(my $wav_out, '>', "$out_dir/wav.scp")
    or die "Could not open the output file $out_dir/wav.scp: $!";
open(my $gender_out, '>', "$out_dir/spk2gender")
    or die "Could not open the output file $out_dir/spk2gender: $!";
open(my $nation_out, '>', "$out_dir/spk2nation")
    or die "Could not open the output file $out_dir/spk2nation: $!";
open(my $lang_out, '>', "$out_dir/utt2lang")
    or die "Could not open the output file $out_dir/utt2lang: $!";

for my $spkr_id (@spkr_dirs) {
    my $gender = $spkr2gender{$spkr_id};
    my $nation = $spkr2nation{$spkr_id};
    if (!defined $gender || !defined $nation) {
        # The line is still written, with an empty value, so that the
        # speaker remains listed; the warning makes the gap visible.
        warn "Warning: speaker $spkr_id not found in $meta_path\n";
        $gender = "" unless defined $gender;
        $nation = "" unless defined $nation;
    }
    print {$gender_out} "$spkr_id $gender\n";
    print {$nation_out} "$spkr_id $nation\n";

    my $spkr_dir = "$wav_dir/$spkr_id";
    for my $vid_id (grep { -d "$spkr_dir/$_" } list_dir($spkr_dir)) {
        my $vid_dir = "$spkr_dir/$vid_id";
        for my $file (grep { /\.wav$/ } list_dir($vid_dir)) {
            (my $segment = $file) =~ s/\.[^.]+$//;
            my $wav    = "$vid_dir/$segment.wav";
            my $utt_id = "$spkr_id-$vid_id-$segment";
            my $lang   = exists $utt2lang{$utt_id} ? $utt2lang{$utt_id} : "N/A";

            print {$wav_out}  "$utt_id $wav\n";
            print {$spkr_out} "$utt_id $spkr_id\n";
            print {$lang_out} "$utt_id $lang\n";
        }
    }
}

# Buffered write errors only surface at close, so every close is checked.
close($spkr_out)   or die "Could not write $out_dir/utt2spk: $!";
close($wav_out)    or die "Could not write $out_dir/wav.scp: $!";
close($lang_out)   or die "Could not write $out_dir/utt2lang: $!";
close($gender_out) or die "Could not write $out_dir/spk2gender: $!";
close($nation_out) or die "Could not write $out_dir/spk2nation: $!";

# ---------------------------------------------------------------------------
# Derived files.  These commands need shell features (glob, pipe, redirect),
# so the output directory is passed through shell_quote().
# ---------------------------------------------------------------------------
my $q_out = shell_quote($out_dir);

# Union of all trials_* files in the output directory.
system("cat $q_out/trials_* | sort -u > $q_out/trials") == 0
    or die "Error creating trials file in directory $out_dir";

# Every enrollment utterance (first trial column) is its own model.
system("awk '{ print \$1,\$1 }' $q_out/trials | sort -u > $q_out/utt2model") == 0
    or die "Error creating utt2model file in directory $out_dir";

system("utils/utt2spk_to_spk2utt.pl $q_out/utt2spk >$q_out/spk2utt") == 0
    or die "Error creating spk2utt file in directory $out_dir";

# The exit status of fix_data_dir.sh is intentionally not checked; the
# validation that follows decides whether the directory is acceptable.
system('env', 'LC_COLLATE=C', 'utils/fix_data_dir.sh', $out_dir);
system('env', 'LC_COLLATE=C', 'utils/validate_data_dir.sh',
       '--no-text', '--no-feats', $out_dir) == 0
    or die "Error validating directory $out_dir";